mark_as_advanced(GMX_FAHCORE)
include(gmxDetectAcceleration)
-if(NOT DEFINED GMX_ACCELERATION)
+if(NOT DEFINED GMX_CPU_ACCELERATION)
if(CMAKE_CROSSCOMPILING)
- set(GMX_SUGGESTED_ACCELERATION "None")
+ set(GMX_SUGGESTED_CPU_ACCELERATION "None")
else(CMAKE_CROSSCOMPILING)
- gmx_detect_acceleration(GMX_SUGGESTED_ACCELERATION)
+ gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
endif(CMAKE_CROSSCOMPILING)
-endif(NOT DEFINED GMX_ACCELERATION)
+endif(NOT DEFINED GMX_CPU_ACCELERATION)
-set(GMX_ACCELERATION "@GMX_SUGGESTED_ACCELERATION@"
- CACHE STRING "Accelerated kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene, Power6, Fortran")
+set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
+ CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene, Power6, Fortran")
set(GMX_FFT_LIBRARY "fftw3"
CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
set(GMX_SOFTWARE_INVSQRT OFF CACHE STRING
"The OpenMM build does not need GROMACS software 1/sqrt!" FORCE)
endif(GMX_SOFTWARE_INVSQRT)
- string(TOUPPER ${GMX_ACCELERATION} GMX_ACCELERATION)
- if(NOT GMX_ACCELERATION STREQUAL "NONE")
- message(STATUS "Switching off CPU-based acceleration, the OpenMM build does not support/need any!")
- set(GMX_ACCELERATION "none" CACHE STRING
- "Switching off CPU-based acceleration, the OpenMM build does not support/need any!" FORCE)
+ string(TOUPPER ${GMX_CPU_ACCELERATION} GMX_CPU_ACCELERATION)
+ if(NOT GMX_CPU_ACCELERATION STREQUAL "NONE")
+ message(STATUS "Switching off CPU-based acceleration, the OpenMM build does not support/need any!")
+ set(GMX_CPU_ACCELERATION "None" CACHE STRING
+ "Switching off CPU-based acceleration, the OpenMM build does not support/need any!" FORCE)
endif()
if(GMX_FAHCORE)
message(FATAL_ERROR "The OpenMM build does not support FAH build!")
message(FATAL_ERROR "The OpenMM-build does not support double precision calculations!")
endif()
# mark as advanced the unused variables
- mark_as_advanced(FORCE GMX_ACCELERATION GMX_MPI GMX_FFT_LIBRARY
+ mark_as_advanced(FORCE GMX_CPU_ACCELERATION GMX_MPI GMX_FFT_LIBRARY
GMX_QMMM_PROGRAM GMX_THREAD_MPI GMX_DOUBLE)
else(GMX_OPENMM)
- mark_as_advanced(CLEAR GMX_ACCELERATION GMX_MPI GMX_FFT_LIBRARY
+ mark_as_advanced(CLEAR GMX_CPU_ACCELERATION GMX_MPI GMX_FFT_LIBRARY
GMX_QMMM_PROGRAM GMX_THREAD_MPI GMX_DOUBLE)
endif(GMX_OPENMM)
endif(NOT GMX_SYSTEM_XDR)
# Process nonbonded accelerated kernels settings
-string(TOUPPER ${GMX_ACCELERATION} ${GMX_ACCELERATION})
-if(${GMX_ACCELERATION} STREQUAL "NONE")
+string(TOUPPER ${GMX_CPU_ACCELERATION} ${GMX_CPU_ACCELERATION})
+if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
# nothing to do
-elseif(${GMX_ACCELERATION} STREQUAL "SSE2")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
if(NOT GNU_SSE2_CFLAG)
message(FATAL_ERROR "Cannot find emmintrin.h, which is required for SSE2 intrinsics support.")
endif(NOT HAVE_EMMINTRIN_H)
+ set(GMX_CPU_ACCELERATION_X86_SSE2 1)
# The user should not be able to set this orthogonally to the acceleration
set(GMX_X86_SSE2 1)
if (NOT ACCELERATION_QUIETLY)
message(STATUS "Enabling SSE2 Gromacs acceleration, and it will help compiler optimization.")
endif()
-elseif(${GMX_ACCELERATION} STREQUAL "SSE4.1")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" GROMACS_C_FLAGS)
if (NOT GNU_SSE4_CFLAG)
message(FATAL_ERROR "Cannot find smmintrin.h, which is required for SSE4.1 intrinsics support.")
endif(NOT HAVE_SMMINTRIN_H)
+ set(GMX_CPU_ACCELERATION_X86_SSE4_1 1)
# The user should not be able to set this orthogonally to the acceleration
set(GMX_X86_SSE4_1 1)
set(GMX_X86_SSE2 1)
message(STATUS "Enabling SSE4.1 Gromacs acceleration, and it will help compiler optimization.")
endif()
-elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUAL "AVX_256")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
# Set the AVX compiler flag for both these choices!
endif()
# Set the FMA4 flags (MSVC doesn't require any)
- if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
+ if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
if (NOT GNU_FMA_CFLAG)
message(WARNING "No C FMA4 flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
set(GMX_X86_SSE2 1)
# But just enable one of the choices internally...
- if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA")
+ if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA")
+ set(GMX_CPU_ACCELERATION_X86_AVX_128_FMA 1)
set(GMX_X86_AVX_128_FMA 1)
if (NOT ACCELERATION_QUIETLY)
message(STATUS "Enabling 128-bit AVX Gromacs acceleration (with fused-multiply add), and it will help compiler optimization.")
endif()
else()
# If we are not doing AVX_128, it must be AVX_256...
+ set(GMX_CPU_ACCELERATION_X86_AVX_256 1)
set(GMX_X86_AVX_256 1)
if (NOT ACCELERATION_QUIETLY)
message(STATUS "Enabling 256-bit AVX Gromacs acceleration, and it will help compiler optimization.")
endif()
endif()
-elseif(${GMX_ACCELERATION} STREQUAL "FORTRAN")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "FORTRAN")
# Fortran is temporarily disabled while we push in nbNxN kernels.
# We need to fake it a bit here to avoid jenkins build errors!
# add_definitions(-DGMX_FORTRAN)
-elseif(${GMX_ACCELERATION} STREQUAL "BLUEGENE")
-# GMX_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
+# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
if (NOT ACCELERATION_QUIETLY)
message(STATUS "Configuring for BlueGene")
endif()
# The automatic testing for endianness does not work for the BlueGene cross-compiler
set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_ACCELERATION} STREQUAL "POWER6")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
set(GMX_POWER6 1)
set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
-else(${GMX_ACCELERATION} STREQUAL "NONE")
- MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, Fortran, BlueGene, Power6")
-endif(${GMX_ACCELERATION} STREQUAL "NONE")
+else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
+ MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, Fortran, BlueGene, Power6")
+endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
if(GMX_FORTRAN OR GMX_POWER6)
set(GMX_FFT_FFTW3 1)
- if (NOT ${GMX_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD)
+ if (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD)
message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
- endif (NOT ${GMX_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD)
+ endif (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD)
elseif(${GMX_FFT_LIBRARY} STREQUAL "MKL")
# MESSAGE(STATUS "Using external FFT library - Intel MKL")
real *lambda,real *dvdlambda,
gmx_grppairener_t *grppener,
gmx_bool bFillGrid,
- gmx_bool bDoLongRangeNS);
+ gmx_bool bDoLongRangeNS,
+ gmx_bool bPadListsForKernels);
/* Debugging routines from wnblist.c */
void * kernelptr_vf;
void * kernelptr_v;
void * kernelptr_f;
+
+ /* Pad the list of neighbors for each i atom with "-1" entries up to the
+ * simd_padding_width, if it is larger than 0. This is necessary for many
+ * accelerated kernels using single-instruction multiple-data operations
+ * internally.
+ */
+ int simd_padding_width;
+
} t_nblist;
/* Use AMD core math library */
#cmakedefine GMX_FFT_ACML
-/* What type of acceleration is used? (string, for dumping to files) */
-#define GMX_ACCELERATION "@GMX_ACCELERATION@"
-
-/* SSE2 acceleration */
+/* SSE2 instructions available */
#cmakedefine GMX_X86_SSE2
-/* SSE4.1 acceleration */
+/* SSE4.1 instructions available */
#cmakedefine GMX_X86_SSE4_1
-/* AVX 128-bit acceleration with FMA, useful on modern AMD hardware */
+/* AVX 128-bit FMA instructions available */
#cmakedefine GMX_X86_AVX_128_FMA
-/* AVX 256-bit acceleration, usually for intel hardware */
+/* AVX 256-bit instructions available */
#cmakedefine GMX_X86_AVX_256
+/* SSE2 was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
+
+/* SSE4.1 was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_SSE4_1
+
+/* AVX 128-bit FMA was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_128_FMA
+
+/* AVX 256-bit was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_256
+
+/* String for CPU acceleration choice (for writing to log files and stdout) */
+#define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
+
/* Integer byte order is big endian. */
#cmakedefine GMX_INTEGER_BIG_ENDIAN
# Sources that should always be built
file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c)
+# For now we enable the (existing) SSE2 kernels for all accelerations
+if(((GMX_CPU_ACCELERATION STREQUAL "SSE2") OR
+ (GMX_CPU_ACCELERATION STREQUAL "SSE4.1") OR
+ (GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA") OR
+ (GMX_CPU_ACCELERATION STREQUAL "AVX_256")) AND NOT GMX_DOUBLE)
+ file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
+endif()
+
# These sources will be used in the parent directory's CMakeLists.txt
-set(NONBONDED_SOURCES ${NONBONDED_SOURCES} PARENT_SCOPE)
+set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} PARENT_SCOPE)
--- /dev/null
+/*
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2011-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifndef _kernelutil_x86_sse2_single_h_
+#define _kernelutil_x86_sse2_single_h_
+
+/* We require SSE2 now! */
+
+#include <math.h>
+
+#include "gmx_x86_sse2.h"
+
+
+/* Normal sum of four xmm registers */
+#define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
+
+static gmx_inline __m128
+gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
+{
+ return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
+}
+
+static int
+gmx_mm_any_lt(__m128 a, __m128 b)
+{
+ return _mm_movemask_ps(_mm_cmplt_ps(a,b));
+}
+
+/* Load a single value from 1-4 places, merge into xmm register */
+
+static __m128
+gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
+ const float * gmx_restrict ptrB,
+ const float * gmx_restrict ptrC,
+ const float * gmx_restrict ptrD)
+{
+ __m128 t1,t2;
+
+ t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
+ t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
+ return _mm_unpacklo_ps(t1,t2);
+}
+
+static void
+gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
+ float * gmx_restrict ptrB,
+ float * gmx_restrict ptrC,
+ float * gmx_restrict ptrD,
+ __m128 xmm1)
+{
+ __m128 t2,t3,t4;
+
+ t3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+ t2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+ t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+ _mm_store_ss(ptrA,xmm1);
+ _mm_store_ss(ptrB,t2);
+ _mm_store_ss(ptrC,t3);
+ _mm_store_ss(ptrD,t4);
+}
+
+/* Similar to store, but increments value in memory */
+static void
+gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
+ float * gmx_restrict ptrB,
+ float * gmx_restrict ptrC,
+ float * gmx_restrict ptrD, __m128 xmm1)
+{
+ __m128 tmp;
+
+ tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
+ tmp = _mm_add_ps(tmp,xmm1);
+ gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
+}
+
+
+static void
+gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
+ const float * gmx_restrict p2,
+ const float * gmx_restrict p3,
+ const float * gmx_restrict p4,
+ __m128 * gmx_restrict c6,
+ __m128 * gmx_restrict c12)
+{
+ __m128 t1,t2,t3,t4;
+
+ t1 = _mm_castpd_ps(_mm_load_sd((const double *)p1));
+ t2 = _mm_castpd_ps(_mm_load_sd((const double *)p2));
+ t3 = _mm_castpd_ps(_mm_load_sd((const double *)p3));
+ t4 = _mm_castpd_ps(_mm_load_sd((const double *)p4));
+ t1 = _mm_unpacklo_ps(t1,t2);
+ t2 = _mm_unpacklo_ps(t3,t4);
+ *c6 = _mm_movelh_ps(t1,t2);
+ *c12 = _mm_movehl_ps(t2,t1);
+}
+
+/* Routines to load 1-4 rvec from 4 places.
+ * We mainly use these to load coordinates. The extra routines
+ * are very efficient for the water-water loops, since we e.g.
+ * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
+ */
+
+
+static void
+gmx_mm_load_1rvec_broadcast_ps(float *ptrA, __m128 *x, __m128 *y, __m128 *z)
+{
+ __m128 t1;
+
+ t1 = _mm_loadu_ps(ptrA);
+
+ *x = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ *y = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+}
+
+static void
+gmx_mm_load_3rvec_broadcast_ps(float *ptrA,
+ __m128 *x1, __m128 *y1, __m128 *z1,
+ __m128 *x2, __m128 *y2, __m128 *z2,
+ __m128 *x3, __m128 *y3, __m128 *z3)
+{
+ __m128 t1,t2,t3;
+
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrA+4);
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+ *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+ *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+ *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+ *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+
+ t3 = _mm_load_ss(ptrA+8);
+ *z3 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+}
+
+static void
+gmx_mm_load_4rvec_broadcast_ps(float *ptrA,
+ __m128 *x1, __m128 *y1, __m128 *z1,
+ __m128 *x2, __m128 *y2, __m128 *z2,
+ __m128 *x3, __m128 *y3, __m128 *z3,
+ __m128 *x4, __m128 *y4, __m128 *z4)
+{
+ __m128 t1,t2,t3;
+ __m128 tA;
+
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrA+4);
+ t3 = _mm_loadu_ps(ptrA+8);
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+ *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+ *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+ *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+ *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+ *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
+ *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+ *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
+ *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
+}
+
+
+static void
+gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
+ const float * gmx_restrict ptrB,
+ const float * gmx_restrict ptrC,
+ const float * gmx_restrict ptrD,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8;
+ t1 = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
+ t2 = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
+ t3 = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
+ t4 = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
+ t5 = _mm_load_ss(ptrA+2);
+ t6 = _mm_load_ss(ptrB+2);
+ t7 = _mm_load_ss(ptrC+2);
+ t8 = _mm_load_ss(ptrD+2);
+ t1 = _mm_unpacklo_ps(t1,t2);
+ t3 = _mm_unpacklo_ps(t3,t4);
+ *x1 = _mm_movelh_ps(t1,t3);
+ *y1 = _mm_movehl_ps(t3,t1);
+ t5 = _mm_unpacklo_ps(t5,t6);
+ t7 = _mm_unpacklo_ps(t7,t8);
+ *z1 = _mm_movelh_ps(t5,t7);
+}
+
+
+static void
+gmx_mm_load_2rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 *x1, __m128 *y1, __m128 *z1,
+ __m128 *x2, __m128 *y2, __m128 *z2)
+{
+ __m128 t1,t2,t3,t4;
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrB);
+ t3 = _mm_loadu_ps(ptrC);
+ t4 = _mm_loadu_ps(ptrD);
+ _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+ *x1 = t1;
+ *y1 = t2;
+ *z1 = t3;
+ *x2 = t4;
+ t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrA+4));
+ t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrB+4));
+ t3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrC+4));
+ t4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrD+4));
+ t1 = _mm_unpacklo_ps(t1,t3);
+ t2 = _mm_unpacklo_ps(t2,t4);
+ *y2 = _mm_unpacklo_ps(t1,t2);
+ *z2 = _mm_unpackhi_ps(t1,t2);
+}
+
+
+static void
+gmx_mm_load_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 *x1, __m128 *y1, __m128 *z1,
+ __m128 *x2, __m128 *y2, __m128 *z2,
+ __m128 *x3, __m128 *y3, __m128 *z3)
+{
+ __m128 t1,t2,t3,t4;
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrB);
+ t3 = _mm_loadu_ps(ptrC);
+ t4 = _mm_loadu_ps(ptrD);
+ _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+ *x1 = t1;
+ *y1 = t2;
+ *z1 = t3;
+ *x2 = t4;
+ t1 = _mm_loadu_ps(ptrA+4);
+ t2 = _mm_loadu_ps(ptrB+4);
+ t3 = _mm_loadu_ps(ptrC+4);
+ t4 = _mm_loadu_ps(ptrD+4);
+ _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+ *y2 = t1;
+ *z2 = t2;
+ *x3 = t3;
+ *y3 = t4;
+ t1 = _mm_load_ss(ptrA+8);
+ t2 = _mm_load_ss(ptrB+8);
+ t3 = _mm_load_ss(ptrC+8);
+ t4 = _mm_load_ss(ptrD+8);
+ t1 = _mm_unpacklo_ps(t1,t3);
+ t3 = _mm_unpacklo_ps(t2,t4);
+ *z3 = _mm_unpacklo_ps(t1,t3);
+}
+
+
+static void
+gmx_mm_load_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 *x1, __m128 *y1, __m128 *z1,
+ __m128 *x2, __m128 *y2, __m128 *z2,
+ __m128 *x3, __m128 *y3, __m128 *z3,
+ __m128 *x4, __m128 *y4, __m128 *z4)
+{
+ __m128 t1,t2,t3,t4;
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrB);
+ t3 = _mm_loadu_ps(ptrC);
+ t4 = _mm_loadu_ps(ptrD);
+ _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+ *x1 = t1;
+ *y1 = t2;
+ *z1 = t3;
+ *x2 = t4;
+ t1 = _mm_loadu_ps(ptrA+4);
+ t2 = _mm_loadu_ps(ptrB+4);
+ t3 = _mm_loadu_ps(ptrC+4);
+ t4 = _mm_loadu_ps(ptrD+4);
+ _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+ *y2 = t1;
+ *z2 = t2;
+ *x3 = t3;
+ *y3 = t4;
+ t1 = _mm_loadu_ps(ptrA+8);
+ t2 = _mm_loadu_ps(ptrB+8);
+ t3 = _mm_loadu_ps(ptrC+8);
+ t4 = _mm_loadu_ps(ptrD+8);
+ _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+ *z3 = t1;
+ *x4 = t2;
+ *y4 = t3;
+ *z4 = t4;
+}
+
+
+/* Routines to increment rvec in memory, typically use for j particle force updates */
+static void
+gmx_mm_increment_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
+{
+ __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
+ __m128 t1;
+
+ t1 = _mm_loadu_ps(ptrA);
+ xyz = _mm_and_ps(mask,xyz);
+ t1 = _mm_add_ps(t1,xyz);
+ _mm_storeu_ps(ptrA,t1);
+}
+
+
+static void
+gmx_mm_increment_3rvec_1ptr_noswizzle_ps(float *ptrA,
+ __m128 xyz1, __m128 xyz2, __m128 xyz3)
+{
+ __m128 t1,t2,t3,t4;
+ __m128 tA,tB,tC;
+
+ tA = _mm_loadu_ps(ptrA);
+ tB = _mm_loadu_ps(ptrA+4);
+ tC = _mm_load_ss(ptrA+8);
+
+ t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+ t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+ t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
+ t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+ t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+
+ tA = _mm_add_ps(tA,t3);
+ tB = _mm_add_ps(tB,t4);
+ tC = _mm_add_ss(tC,t2);
+
+ _mm_storeu_ps(ptrA,tA);
+ _mm_storeu_ps(ptrA+4,tB);
+ _mm_store_ss(ptrA+8,tC);
+}
+
+static void
+gmx_mm_increment_4rvec_1ptr_noswizzle_ps(float *ptrA,
+ __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
+{
+ __m128 t1,t2,t3,t4,t5;
+ __m128 tA,tB,tC;
+
+ tA = _mm_loadu_ps(ptrA);
+ tB = _mm_loadu_ps(ptrA+4);
+ tC = _mm_loadu_ps(ptrA+8);
+
+ t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+ t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+ t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
+ t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+ t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+ t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
+
+ t2 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0)); /* x4 x4 z3 z3 */
+ t5 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
+
+ tA = _mm_add_ps(tA,t3);
+ tB = _mm_add_ps(tB,t4);
+ tC = _mm_add_ps(tC,t5);
+
+ _mm_storeu_ps(ptrA,tA);
+ _mm_storeu_ps(ptrA+4,tB);
+ _mm_storeu_ps(ptrA+8,tC);
+
+}
+
+
+
+static void
+gmx_mm_increment_1rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC,float *ptrD,
+ __m128 x1, __m128 y1, __m128 z1)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
+ t5 = _mm_unpacklo_ps(y1,z1);
+ t6 = _mm_unpackhi_ps(y1,z1);
+ t7 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
+ t8 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
+ t9 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
+ t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
+ t1 = _mm_load_ss(ptrA);
+ t1 = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
+ t1 = _mm_add_ps(t1,t7);
+ _mm_store_ss(ptrA,t1);
+ _mm_storeh_pi((__m64 *)(ptrA+1),t1);
+ t2 = _mm_load_ss(ptrB);
+ t2 = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
+ t2 = _mm_add_ps(t2,t8);
+ _mm_store_ss(ptrB,t2);
+ _mm_storeh_pi((__m64 *)(ptrB+1),t2);
+ t3 = _mm_load_ss(ptrC);
+ t3 = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
+ t3 = _mm_add_ps(t3,t9);
+ _mm_store_ss(ptrC,t3);
+ _mm_storeh_pi((__m64 *)(ptrC+1),t3);
+ t4 = _mm_load_ss(ptrD);
+ t4 = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
+ t4 = _mm_add_ps(t4,t10);
+ _mm_store_ss(ptrD,t4);
+ _mm_storeh_pi((__m64 *)(ptrD+1),t4);
+}
+
+
+
+
+static void
+gmx_mm_increment_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 x1, __m128 y1, __m128 z1,
+ __m128 x2, __m128 y2, __m128 z2,
+ __m128 x3, __m128 y3, __m128 z3)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+ __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
+ __m128 t20,t21,t22,t23,t24,t25;
+
+ t13 = _mm_unpackhi_ps(x1,y1);
+ x1 = _mm_unpacklo_ps(x1,y1);
+ t14 = _mm_unpackhi_ps(z1,x2);
+ z1 = _mm_unpacklo_ps(z1,x2);
+ t15 = _mm_unpackhi_ps(y2,z2);
+ y2 = _mm_unpacklo_ps(y2,z2);
+ t16 = _mm_unpackhi_ps(x3,y3);
+ x3 = _mm_unpacklo_ps(x3,y3);
+ t17 = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
+ t18 = _mm_movehl_ps(z3,z3);
+ t19 = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
+ t20 = _mm_movelh_ps(x1,z1);
+ t21 = _mm_movehl_ps(z1,x1);
+ t22 = _mm_movelh_ps(t13,t14);
+ t14 = _mm_movehl_ps(t14,t13);
+ t23 = _mm_movelh_ps(y2,x3);
+ t24 = _mm_movehl_ps(x3,y2);
+ t25 = _mm_movelh_ps(t15,t16);
+ t16 = _mm_movehl_ps(t16,t15);
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrA+4);
+ t3 = _mm_load_ss(ptrA+8);
+ t1 = _mm_add_ps(t1,t20);
+ t2 = _mm_add_ps(t2,t23);
+ t3 = _mm_add_ss(t3,z3);
+ _mm_storeu_ps(ptrA,t1);
+ _mm_storeu_ps(ptrA+4,t2);
+ _mm_store_ss(ptrA+8,t3);
+ t4 = _mm_loadu_ps(ptrB);
+ t5 = _mm_loadu_ps(ptrB+4);
+ t6 = _mm_load_ss(ptrB+8);
+ t4 = _mm_add_ps(t4,t21);
+ t5 = _mm_add_ps(t5,t24);
+ t6 = _mm_add_ss(t6,t17);
+ _mm_storeu_ps(ptrB,t4);
+ _mm_storeu_ps(ptrB+4,t5);
+ _mm_store_ss(ptrB+8,t6);
+ t7 = _mm_loadu_ps(ptrC);
+ t8 = _mm_loadu_ps(ptrC+4);
+ t9 = _mm_load_ss(ptrC+8);
+ t7 = _mm_add_ps(t7,t22);
+ t8 = _mm_add_ps(t8,t25);
+ t9 = _mm_add_ss(t9,t18);
+ _mm_storeu_ps(ptrC,t7);
+ _mm_storeu_ps(ptrC+4,t8);
+ _mm_store_ss(ptrC+8,t9);
+ t10 = _mm_loadu_ps(ptrD);
+ t11 = _mm_loadu_ps(ptrD+4);
+ t12 = _mm_load_ss(ptrD+8);
+ t10 = _mm_add_ps(t10,t14);
+ t11 = _mm_add_ps(t11,t16);
+ t12 = _mm_add_ss(t12,t19);
+ _mm_storeu_ps(ptrD,t10);
+ _mm_storeu_ps(ptrD+4,t11);
+ _mm_store_ss(ptrD+8,t12);
+}
+
+
+static void
+gmx_mm_increment_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 x1, __m128 y1, __m128 z1,
+ __m128 x2, __m128 y2, __m128 z2,
+ __m128 x3, __m128 y3, __m128 z3,
+ __m128 x4, __m128 y4, __m128 z4)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
+ __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
+ __m128 t23,t24;
+ t13 = _mm_unpackhi_ps(x1,y1);
+ x1 = _mm_unpacklo_ps(x1,y1);
+ t14 = _mm_unpackhi_ps(z1,x2);
+ z1 = _mm_unpacklo_ps(z1,x2);
+ t15 = _mm_unpackhi_ps(y2,z2);
+ y2 = _mm_unpacklo_ps(y2,z2);
+ t16 = _mm_unpackhi_ps(x3,y3);
+ x3 = _mm_unpacklo_ps(x3,y3);
+ t17 = _mm_unpackhi_ps(z3,x4);
+ z3 = _mm_unpacklo_ps(z3,x4);
+ t18 = _mm_unpackhi_ps(y4,z4);
+ y4 = _mm_unpacklo_ps(y4,z4);
+ t19 = _mm_movelh_ps(x1,z1);
+ z1 = _mm_movehl_ps(z1,x1);
+ t20 = _mm_movelh_ps(t13,t14);
+ t14 = _mm_movehl_ps(t14,t13);
+ t21 = _mm_movelh_ps(y2,x3);
+ x3 = _mm_movehl_ps(x3,y2);
+ t22 = _mm_movelh_ps(t15,t16);
+ t16 = _mm_movehl_ps(t16,t15);
+ t23 = _mm_movelh_ps(z3,y4);
+ y4 = _mm_movehl_ps(y4,z3);
+ t24 = _mm_movelh_ps(t17,t18);
+ t18 = _mm_movehl_ps(t18,t17);
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrA+4);
+ t3 = _mm_loadu_ps(ptrA+8);
+ t1 = _mm_add_ps(t1,t19);
+ t2 = _mm_add_ps(t2,t21);
+ t3 = _mm_add_ps(t3,t23);
+ _mm_storeu_ps(ptrA,t1);
+ _mm_storeu_ps(ptrA+4,t2);
+ _mm_storeu_ps(ptrA+8,t3);
+ t4 = _mm_loadu_ps(ptrB);
+ t5 = _mm_loadu_ps(ptrB+4);
+ t6 = _mm_loadu_ps(ptrB+8);
+ t4 = _mm_add_ps(t4,z1);
+ t5 = _mm_add_ps(t5,x3);
+ t6 = _mm_add_ps(t6,y4);
+ _mm_storeu_ps(ptrB,t4);
+ _mm_storeu_ps(ptrB+4,t5);
+ _mm_storeu_ps(ptrB+8,t6);
+ t7 = _mm_loadu_ps(ptrC);
+ t8 = _mm_loadu_ps(ptrC+4);
+ t9 = _mm_loadu_ps(ptrC+8);
+ t7 = _mm_add_ps(t7,t20);
+ t8 = _mm_add_ps(t8,t22);
+ t9 = _mm_add_ps(t9,t24);
+ _mm_storeu_ps(ptrC,t7);
+ _mm_storeu_ps(ptrC+4,t8);
+ _mm_storeu_ps(ptrC+8,t9);
+ t10 = _mm_loadu_ps(ptrD);
+ t11 = _mm_loadu_ps(ptrD+4);
+ t12 = _mm_loadu_ps(ptrD+8);
+ t10 = _mm_add_ps(t10,t14);
+ t11 = _mm_add_ps(t11,t16);
+ t12 = _mm_add_ps(t12,t18);
+ _mm_storeu_ps(ptrD,t10);
+ _mm_storeu_ps(ptrD+4,t11);
+ _mm_storeu_ps(ptrD+8,t12);
+}
+
+
+/* Routines to decrement rvec in memory */
+static void
+gmx_mm_decrement_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
+{
+ __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
+ __m128 t1;
+
+ t1 = _mm_loadu_ps(ptrA);
+ xyz = _mm_and_ps(mask,xyz);
+ t1 = _mm_sub_ps(t1,xyz);
+ _mm_storeu_ps(ptrA,t1);
+}
+
+
+static void
+gmx_mm_decrement_3rvec_1ptr_noswizzle_ps(float *ptrA,
+ __m128 xyz1, __m128 xyz2, __m128 xyz3)
+{
+ __m128 t1,t2,t3,t4;
+ __m128 tA,tB,tC;
+
+ tA = _mm_loadu_ps(ptrA);
+ tB = _mm_loadu_ps(ptrA+4);
+ tC = _mm_load_ss(ptrA+8);
+
+ t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+ t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+ t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
+ t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+ t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+
+ tA = _mm_sub_ps(tA,t3);
+ tB = _mm_sub_ps(tB,t4);
+ tC = _mm_sub_ss(tC,t2);
+
+ _mm_storeu_ps(ptrA,tA);
+ _mm_storeu_ps(ptrA+4,tB);
+ _mm_store_ss(ptrA+8,tC);
+}
+
+static void
+gmx_mm_decrement_4rvec_1ptr_noswizzle_ps(float *ptrA,
+ __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
+{
+ __m128 t1,t2,t3,t4,t5;
+ __m128 tA,tB,tC;
+
+ tA = _mm_loadu_ps(ptrA);
+ tB = _mm_loadu_ps(ptrA+4);
+ tC = _mm_loadu_ps(ptrA+8);
+
+ t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+ t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+ t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
+ t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+ t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+
+ t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
+ t2 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0)); /* x4 x4 z3 z3 */
+ t5 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
+
+ tA = _mm_sub_ps(tA,t3);
+ tB = _mm_sub_ps(tB,t4);
+ tC = _mm_sub_ps(tC,t5);
+
+ _mm_storeu_ps(ptrA,tA);
+ _mm_storeu_ps(ptrA+4,tB);
+ _mm_storeu_ps(ptrA+8,tC);
+
+}
+
+
+static void
+gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
+ float * gmx_restrict ptrB,
+ float * gmx_restrict ptrC,
+ float * gmx_restrict ptrD,
+ __m128 x1, __m128 y1, __m128 z1)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
+ t5 = _mm_unpacklo_ps(y1,z1);
+ t6 = _mm_unpackhi_ps(y1,z1);
+ t7 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
+ t8 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
+ t9 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
+ t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
+ t1 = _mm_load_ss(ptrA);
+ t1 = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
+ t1 = _mm_sub_ps(t1,t7);
+ _mm_store_ss(ptrA,t1);
+ _mm_storeh_pi((__m64 *)(ptrA+1),t1);
+ t2 = _mm_load_ss(ptrB);
+ t2 = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
+ t2 = _mm_sub_ps(t2,t8);
+ _mm_store_ss(ptrB,t2);
+ _mm_storeh_pi((__m64 *)(ptrB+1),t2);
+ t3 = _mm_load_ss(ptrC);
+ t3 = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
+ t3 = _mm_sub_ps(t3,t9);
+ _mm_store_ss(ptrC,t3);
+ _mm_storeh_pi((__m64 *)(ptrC+1),t3);
+ t4 = _mm_load_ss(ptrD);
+ t4 = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
+ t4 = _mm_sub_ps(t4,t10);
+ _mm_store_ss(ptrD,t4);
+ _mm_storeh_pi((__m64 *)(ptrD+1),t4);
+}
+
+
+
+static void
+gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 x1, __m128 y1, __m128 z1,
+ __m128 x2, __m128 y2, __m128 z2,
+ __m128 x3, __m128 y3, __m128 z3)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+ __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
+ __m128 t20,t21,t22,t23,t24,t25;
+
+ t13 = _mm_unpackhi_ps(x1,y1);
+ x1 = _mm_unpacklo_ps(x1,y1);
+ t14 = _mm_unpackhi_ps(z1,x2);
+ z1 = _mm_unpacklo_ps(z1,x2);
+ t15 = _mm_unpackhi_ps(y2,z2);
+ y2 = _mm_unpacklo_ps(y2,z2);
+ t16 = _mm_unpackhi_ps(x3,y3);
+ x3 = _mm_unpacklo_ps(x3,y3);
+ t17 = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
+ t18 = _mm_movehl_ps(z3,z3);
+ t19 = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
+ t20 = _mm_movelh_ps(x1,z1);
+ t21 = _mm_movehl_ps(z1,x1);
+ t22 = _mm_movelh_ps(t13,t14);
+ t14 = _mm_movehl_ps(t14,t13);
+ t23 = _mm_movelh_ps(y2,x3);
+ t24 = _mm_movehl_ps(x3,y2);
+ t25 = _mm_movelh_ps(t15,t16);
+ t16 = _mm_movehl_ps(t16,t15);
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrA+4);
+ t3 = _mm_load_ss(ptrA+8);
+ t1 = _mm_sub_ps(t1,t20);
+ t2 = _mm_sub_ps(t2,t23);
+ t3 = _mm_sub_ss(t3,z3);
+ _mm_storeu_ps(ptrA,t1);
+ _mm_storeu_ps(ptrA+4,t2);
+ _mm_store_ss(ptrA+8,t3);
+ t4 = _mm_loadu_ps(ptrB);
+ t5 = _mm_loadu_ps(ptrB+4);
+ t6 = _mm_load_ss(ptrB+8);
+ t4 = _mm_sub_ps(t4,t21);
+ t5 = _mm_sub_ps(t5,t24);
+ t6 = _mm_sub_ss(t6,t17);
+ _mm_storeu_ps(ptrB,t4);
+ _mm_storeu_ps(ptrB+4,t5);
+ _mm_store_ss(ptrB+8,t6);
+ t7 = _mm_loadu_ps(ptrC);
+ t8 = _mm_loadu_ps(ptrC+4);
+ t9 = _mm_load_ss(ptrC+8);
+ t7 = _mm_sub_ps(t7,t22);
+ t8 = _mm_sub_ps(t8,t25);
+ t9 = _mm_sub_ss(t9,t18);
+ _mm_storeu_ps(ptrC,t7);
+ _mm_storeu_ps(ptrC+4,t8);
+ _mm_store_ss(ptrC+8,t9);
+ t10 = _mm_loadu_ps(ptrD);
+ t11 = _mm_loadu_ps(ptrD+4);
+ t12 = _mm_load_ss(ptrD+8);
+ t10 = _mm_sub_ps(t10,t14);
+ t11 = _mm_sub_ps(t11,t16);
+ t12 = _mm_sub_ss(t12,t19);
+ _mm_storeu_ps(ptrD,t10);
+ _mm_storeu_ps(ptrD+4,t11);
+ _mm_store_ss(ptrD+8,t12);
+}
+
+
+static void
+gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+ __m128 x1, __m128 y1, __m128 z1,
+ __m128 x2, __m128 y2, __m128 z2,
+ __m128 x3, __m128 y3, __m128 z3,
+ __m128 x4, __m128 y4, __m128 z4)
+{
+ __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
+ __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
+ __m128 t23,t24;
+ t13 = _mm_unpackhi_ps(x1,y1);
+ x1 = _mm_unpacklo_ps(x1,y1);
+ t14 = _mm_unpackhi_ps(z1,x2);
+ z1 = _mm_unpacklo_ps(z1,x2);
+ t15 = _mm_unpackhi_ps(y2,z2);
+ y2 = _mm_unpacklo_ps(y2,z2);
+ t16 = _mm_unpackhi_ps(x3,y3);
+ x3 = _mm_unpacklo_ps(x3,y3);
+ t17 = _mm_unpackhi_ps(z3,x4);
+ z3 = _mm_unpacklo_ps(z3,x4);
+ t18 = _mm_unpackhi_ps(y4,z4);
+ y4 = _mm_unpacklo_ps(y4,z4);
+ t19 = _mm_movelh_ps(x1,z1);
+ z1 = _mm_movehl_ps(z1,x1);
+ t20 = _mm_movelh_ps(t13,t14);
+ t14 = _mm_movehl_ps(t14,t13);
+ t21 = _mm_movelh_ps(y2,x3);
+ x3 = _mm_movehl_ps(x3,y2);
+ t22 = _mm_movelh_ps(t15,t16);
+ t16 = _mm_movehl_ps(t16,t15);
+ t23 = _mm_movelh_ps(z3,y4);
+ y4 = _mm_movehl_ps(y4,z3);
+ t24 = _mm_movelh_ps(t17,t18);
+ t18 = _mm_movehl_ps(t18,t17);
+ t1 = _mm_loadu_ps(ptrA);
+ t2 = _mm_loadu_ps(ptrA+4);
+ t3 = _mm_loadu_ps(ptrA+8);
+ t1 = _mm_sub_ps(t1,t19);
+ t2 = _mm_sub_ps(t2,t21);
+ t3 = _mm_sub_ps(t3,t23);
+ _mm_storeu_ps(ptrA,t1);
+ _mm_storeu_ps(ptrA+4,t2);
+ _mm_storeu_ps(ptrA+8,t3);
+ t4 = _mm_loadu_ps(ptrB);
+ t5 = _mm_loadu_ps(ptrB+4);
+ t6 = _mm_loadu_ps(ptrB+8);
+ t4 = _mm_sub_ps(t4,z1);
+ t5 = _mm_sub_ps(t5,x3);
+ t6 = _mm_sub_ps(t6,y4);
+ _mm_storeu_ps(ptrB,t4);
+ _mm_storeu_ps(ptrB+4,t5);
+ _mm_storeu_ps(ptrB+8,t6);
+ t7 = _mm_loadu_ps(ptrC);
+ t8 = _mm_loadu_ps(ptrC+4);
+ t9 = _mm_loadu_ps(ptrC+8);
+ t7 = _mm_sub_ps(t7,t20);
+ t8 = _mm_sub_ps(t8,t22);
+ t9 = _mm_sub_ps(t9,t24);
+ _mm_storeu_ps(ptrC,t7);
+ _mm_storeu_ps(ptrC+4,t8);
+ _mm_storeu_ps(ptrC+8,t9);
+ t10 = _mm_loadu_ps(ptrD);
+ t11 = _mm_loadu_ps(ptrD+4);
+ t12 = _mm_loadu_ps(ptrD+8);
+ t10 = _mm_sub_ps(t10,t14);
+ t11 = _mm_sub_ps(t11,t16);
+ t12 = _mm_sub_ps(t12,t18);
+ _mm_storeu_ps(ptrD,t10);
+ _mm_storeu_ps(ptrD+4,t11);
+ _mm_storeu_ps(ptrD+8,t12);
+}
+
+
+
+static gmx_inline void
+gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+ float *fptr,
+ float *fshiftptr)
+{
+ __m128 t1,t2,t3;
+
+ /* transpose data */
+ t1 = fix1;
+ _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
+ fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
+
+ t2 = _mm_load_ss(fptr);
+ t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+ t3 = _mm_load_ss(fshiftptr);
+ t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+ t2 = _mm_add_ps(t2,fix1);
+ t3 = _mm_add_ps(t3,fix1);
+
+ _mm_store_ss(fptr,t2);
+ _mm_storeh_pi((__m64 *)(fptr+1),t2);
+ _mm_store_ss(fshiftptr,t3);
+ _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+}
+
+static gmx_inline void
+gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+ __m128 fix2, __m128 fiy2, __m128 fiz2,
+ float *fptr,
+ float *fshiftptr)
+{
+ __m128 t1,t2,t4;
+
+ /* transpose data */
+ _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
+ t1 = _mm_unpacklo_ps(fiy2,fiz2);
+ t2 = _mm_unpackhi_ps(fiy2,fiz2);
+
+ fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
+ t1 = _mm_add_ps(t1,t2);
+ t2 = _mm_movehl_ps(t2,t1);
+ fiy2 = _mm_add_ps(t1,t2);
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
+ _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
+
+ t4 = _mm_load_ss(fshiftptr+2);
+ t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2)); /* fiy2 - fix2 fiz1 */
+ t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0)); /* fiy2 fix2 - fiz1 */
+ t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1 - fiz2 */
+
+ t1 = _mm_add_ps(t1,t2);
+ t1 = _mm_add_ps(t1,t4); /* y x - z */
+
+ _mm_store_ss(fshiftptr+2,t1);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+
+
+
+static gmx_inline void
+gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+ __m128 fix2, __m128 fiy2, __m128 fiz2,
+ __m128 fix3, __m128 fiy3, __m128 fiz3,
+ float *fptr,
+ float *fshiftptr)
+{
+ __m128 t1,t2,t3,t4;
+
+ /* transpose data */
+ _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
+ _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
+ t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
+ t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
+ t3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));
+
+ fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
+ fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
+ fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1) , _mm_add_ps(t2,t3));
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+ _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+ t4 = _mm_load_ss(fshiftptr+2);
+ t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
+ t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
+ t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
+ t3 = _mm_shuffle_ps(t3 ,t3 ,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
+
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ps(t3,t4);
+ t1 = _mm_add_ps(t1,t3); /* y x - z */
+
+ _mm_store_ss(fshiftptr+2,t1);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+
+
+static gmx_inline void
+gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+ __m128 fix2, __m128 fiy2, __m128 fiz2,
+ __m128 fix3, __m128 fiy3, __m128 fiz3,
+ __m128 fix4, __m128 fiy4, __m128 fiz4,
+ float *fptr,
+ float *fshiftptr)
+{
+ __m128 t1,t2,t3,t4,t5;
+
+ /* transpose data */
+ _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
+ _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
+ _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);
+
+ fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
+ fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
+ fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+ _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+ t5 = _mm_load_ss(fshiftptr+2);
+ t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+ t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+ t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+ t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+ t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3));
+
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ps(t3,t4);
+ t1 = _mm_add_ps(t1,t3);
+ t5 = _mm_add_ps(t5,t1);
+
+ _mm_store_ss(fshiftptr+2,t5);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+}
+
+
+
+static void
+gmx_mm_update_1pot_ps(__m128 pot1, float *ptrA)
+{
+ pot1 = _mm_add_ps(pot1,_mm_movehl_ps(_mm_setzero_ps(),pot1));
+ pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
+ _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+}
+
+static void
+gmx_mm_update_2pot_ps(__m128 pot1, float *ptrA,
+ __m128 pot2, float *ptrB)
+{
+ __m128 t1,t2;
+ t1 = _mm_movehl_ps(pot2,pot1);
+ t2 = _mm_movelh_ps(pot1,pot2);
+ t1 = _mm_add_ps(t1,t2);
+ t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+ pot1 = _mm_add_ps(t1,t2);
+ pot2 = _mm_movehl_ps(t2,pot1);
+ _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+ _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
+}
+
+
+static void
+gmx_mm_update_4pot_ps(__m128 pot1, float *ptrA,
+ __m128 pot2, float *ptrB,
+ __m128 pot3, float *ptrC,
+ __m128 pot4, float *ptrD)
+{
+ _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
+ pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
+ pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
+ pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
+ pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
+ _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+ _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
+ _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
+ _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+}
+
+
+#endif /* _kernelutil_x86_sse2_single_h_ */
--- /dev/null
+#!/usr/bin/python
+
+import sys
+import os
+sys.path.append ( "../preprocessor" )
+from gmxpreprocess import gmxpreprocess
+
+# "The happiest programs are programs that write other programs."
+#
+#
+# This script controls the generation of Gromacs nonbonded kernels.
+#
+# We no longer generate kernels on-the-fly, so this file is not run
+# during a Gromacs compile - only when we need to update the kernels (=rarely).
+#
+# To maximize performance, each combination of interactions in Gromacs
+# has a separate nonbonded kernel without conditionals in the code.
+# To avoid writing hundreds of different routines for each architecture,
+# we instead use a custom preprocessor so we can encode the conditionals
+# and expand for-loops (e.g, for water-water interactions)
+# from a general kernel template. While that file will contain quite a
+# few preprocessor directives, it is still an order of magnitude easier
+# to maintain than ~200 different kernels (not to mention it avoids bugs).
+#
+# To actually generate the kernels, this program iteratively calls the
+# preprocessor with different define settings corresponding to all
+# combinations of coulomb/van-der-Waals/geometry options.
+#
+# A main goal in the design was to make this new generator _general_. For
+# this reason we have used a lot of different fields to identify a particular
+# kernel and interaction. Basically, each kernel will have a name like
+#
+# nbkernel_ElecXX_VdwYY_GeomZZ_VF_QQ()
+#
+# Where XX/YY/ZZ/VF are strings to identify what the kernel computes.
+#
+# Elec/Vdw describe the type of interaction for electrostatics and van der Waals.
+# The geometry settings correspond e.g. to water-water or water-particle kernels,
+# and finally the VF setting is V,F,or VF depending on whether we calculate
+# only the potential, only the force, or both of them. The final string (QQ)
+# is the architecture/language/optimization of the kernel.
+#
+Arch = 'sse2_single'
+
+# Explanation of the 'properties':
+#
+# It is cheap to compute r^2, and the kernels require various other functions of r for
+# different kinds of interaction. Depending on the needs of the kernel and the available
+# processor instructions, this will be done in different ways.
+#
+# 'rinv' means we need 1/r, which is calculated as 1/sqrt(r^2).
+# 'rinvsq' means we need 1/(r*r). This is calculated as rinv*rinv if we already did rinv, otherwise 1/r^2.
+# 'r' is similarly calculated as r^2*rinv when needed
+# 'table' means the interaction is tabulated, in which case we will calculate a table index before the interaction
+# 'shift' means the interaction will be modified by a constant to make it zero at the cutoff.
+# 'cutoff' means the interaction is set to 0.0 outside the cutoff
+#
+
+FileHeader = \
+'/*\n' \
+' * Note: this file was generated by the Gromacs '+Arch+' kernel generator.\n' \
+' *\n' \
+' * This source code is part of\n' \
+' *\n' \
+' * G R O M A C S\n' \
+' *\n' \
+' * Copyright (c) 2001-2012, The GROMACS Development Team\n' \
+' *\n' \
+' * Gromacs is a library for molecular simulation and trajectory analysis,\n' \
+' * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for\n' \
+' * a full list of developers and information, check out http://www.gromacs.org\n' \
+' *\n' \
+' * This program is free software; you can redistribute it and/or modify it under\n' \
+' * the terms of the GNU Lesser General Public License as published by the Free\n' \
+' * Software Foundation; either version 2 of the License, or (at your option) any\n' \
+' * later version.\n' \
+' *\n' \
+' * To help fund GROMACS development, we humbly ask that you cite\n' \
+' * the papers people have written on it - you can find them on the website.\n' \
+' */\n'
+
+###############################################
+# ELECTROSTATICS
+# Interactions and flags for them
+###############################################
+ElectrostaticsList = {
+ 'None' : [],
+ 'Coulomb' : ['rinv','rinvsq'],
+ 'ReactionField' : ['rinv','rinvsq'],
+ 'GeneralizedBorn' : ['rinv','r'],
+ 'CubicSplineTable' : ['rinv','r','table'],
+ 'Ewald' : ['rinv','rinvsq','r'],
+}
+
+
+###############################################
+# VAN DER WAALS
+# Interactions and flags for them
+###############################################
+VdwList = {
+ 'None' : [],
+ 'LennardJones' : ['rinvsq'],
+# 'Buckingham' : ['rinv','rinvsq','r'], # Disabled for sse2 to reduce number of kernels and simply the template
+ 'CubicSplineTable' : ['rinv','r','table'],
+}
+
+
+###############################################
+# MODIFIERS
+# Different ways to adjust/modify interactions to conserve energy
+###############################################
+ModifierList = {
+ 'None' : [],
+ 'ExactCutoff' : ['exactcutoff'], # Zero the interaction outside the cutoff, used for reaction-field-zero
+ 'PotentialShift' : ['shift','exactcutoff'],
+ 'PotentialSwitch' : ['rinv','r','switch','exactcutoff']
+}
+
+
+###############################################
+# GEOMETRY COMBINATIONS
+###############################################
+GeometryNameList = [
+ [ 'Particle' , 'Particle' ],
+ [ 'Water3' , 'Particle' ],
+ [ 'Water3' , 'Water3' ],
+ [ 'Water4' , 'Particle' ],
+ [ 'Water4' , 'Water4' ]
+]
+
+
+###############################################
+# POTENTIAL / FORCE
+###############################################
+VFList = [
+ 'PotentialAndForce',
+# 'Potential', # Not used yet
+ 'Force'
+]
+
+
+###############################################
+# GEOMETRY PROPERTIES
+###############################################
+# Dictionaries with lists telling which interactions are present
+# 1,2,3 means particles 1,2,3 (but not 0) have electrostatics!
+GeometryElectrostatics = {
+ 'Particle' : [ 0 ],
+ 'Particle2' : [ 0 , 1 ],
+ 'Particle3' : [ 0 , 1 , 2 ],
+ 'Particle4' : [ 0 , 1 , 2 , 3 ],
+ 'Water3' : [ 0 , 1 , 2 ],
+ 'Water4' : [ 1 , 2 , 3 ]
+}
+
+GeometryVdw = {
+ 'Particle' : [ 0 ],
+ 'Particle2' : [ 0 , 1 ],
+ 'Particle3' : [ 0 , 1 , 2 ],
+ 'Particle4' : [ 0 , 1 , 2 , 3 ],
+ 'Water3' : [ 0 ],
+ 'Water4' : [ 0 ]
+}
+
+
+
+
+# Dictionary to abbreviate all strings (mixed from all the lists)
+Abbreviation = {
+ 'None' : 'None',
+ 'Coulomb' : 'Coul',
+ 'Ewald' : 'Ew',
+ 'ReactionField' : 'RF',
+ 'GeneralizedBorn' : 'GB',
+ 'CubicSplineTable' : 'CSTab',
+ 'LennardJones' : 'LJ',
+ 'Buckingham' : 'Bham',
+ 'PotentialShift' : 'Sh',
+ 'PotentialSwitch' : 'Sw',
+ 'ExactCutoff' : 'Cut',
+ 'PotentialAndForce' : 'VF',
+ 'Potential' : 'V',
+ 'Force' : 'F',
+ 'Water3' : 'W3',
+ 'Water4' : 'W4',
+ 'Particle' : 'P1',
+ 'Particle2' : 'P2',
+ 'Particle3' : 'P3',
+ 'Particle4' : 'P4'
+}
+
+
+###############################################
+# Functions
+###############################################
+
+# Return a string with the kernel name from current settings
+def MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom):
+ ElecStr = 'Elec' + Abbreviation[KernelElec]
+ if(KernelElecMod!='None'):
+ ElecStr = ElecStr + Abbreviation[KernelElecMod]
+ VdwStr = 'Vdw' + Abbreviation[KernelVdw]
+ if(KernelVdwMod!='None'):
+ VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+ GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+ return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + Arch
+
+def MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+ ElecStr = 'Elec' + Abbreviation[KernelElec]
+ if(KernelElecMod!='None'):
+ ElecStr = ElecStr + Abbreviation[KernelElecMod]
+ VdwStr = 'Vdw' + Abbreviation[KernelVdw]
+ if(KernelVdwMod!='None'):
+ VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+ GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+ VFStr = Abbreviation[KernelVF]
+ return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + VFStr + '_' + Arch
+
+# Return a string with a declaration to use for the kernel;
+# this will be a sequence of string combinations as well as the actual function name
+# Dont worry about field widths - that is just pretty-printing for the header!
+def MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF):
+ KernelStr = '\"'+KernelName+'\"'
+ ArchStr = '\"'+Arch+'\"'
+ ElecStr = '\"'+KernelElec+'\"'
+ ElecModStr = '\"'+KernelElecMod+'\"'
+ VdwStr = '\"'+KernelVdw+'\"'
+ VdwModStr = '\"'+KernelVdwMod+'\"'
+ GeomStr = '\"'+KernelGeom[0]+KernelGeom[1]+'\"'
+ OtherStr = '\"'+KernelOther+'\"'
+ VFStr = '\"'+KernelVF+'\"'
+
+ ThisSpec = ArchStr+', '+ElecStr+', '+ElecModStr+', '+VdwStr+', '+VdwModStr+', '+GeomStr+', '+OtherStr+', '+VFStr
+ ThisDecl = ' { '+KernelName+', '+KernelStr+', '+ThisSpec+' }'
+ return ThisDecl
+
+
+# Returns 1 if this kernel should be created, 0 if we should skip it
+# This routine is not critical - it is not the end of the world if we create more kernels,
+# but since the number is pretty large we save both space and compile-time by reducing it a bit.
+def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+
+ # No need for kernels without interactions
+ if(KernelElec=='None' and KernelVdw=='None'):
+ return 0
+
+ # No need for modifiers without interactions
+ if((KernelElec=='None' and KernelElecMod!='None') or (KernelVdw=='None' and KernelVdwMod!='None')):
+ return 0
+
+ # No need for LJ-only water optimization, or water optimization with implicit solvent.
+ if('Water' in KernelGeom[0] and (KernelElec=='None' or 'GeneralizedBorn' in KernelElec)):
+ return 0
+
+ # Non-matching table settings are pointless
+ if( ('Table' in KernelElec) and ('Table' in KernelVdw) and KernelElec!=KernelVdw ):
+ return 0
+
+ # Try to reduce the number of different switch/shift options to get a reasonable number of kernels
+ # For electrostatics, reaction-field can use 'exactcutoff', and ewald can use switch or shift.
+ if(KernelElecMod=='ExactCutoff' and KernelElec!='ReactionField'):
+ return 0
+ if(KernelElecMod in ['PotentialShift','PotentialSwitch'] and KernelElec!='Ewald'):
+ return 0
+ # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
+ if((KernelVdwMod=='ExactCutoff') or
+ (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+ return 0
+
+ # Choose either switch or shift and don't mix them...
+ if((KernelElecMod=='PotentialShift' and KernelVdwMod=='PotentialSwitch') or
+ (KernelElecMod=='PotentialSwitch' and KernelVdwMod=='PotentialShift')):
+ return 0
+
+ # Don't use a Vdw kernel with a modifier if the electrostatics one does not have one
+ if(KernelElec!='None' and KernelElecMod=='None' and KernelVdwMod!='None'):
+ return 0
+
+ # Don't use an electrostatics kernel with a modifier if the vdw one does not have one,
+ # unless the electrostatics one is reaction-field with exact cutoff.
+ if(KernelVdw!='None' and KernelVdwMod=='None' and KernelElecMod!='None'):
+ if(KernelElec=='ReactionField' and KernelVdw!='CubicSplineTable'):
+ return 0
+ elif(KernelElec!='ReactionField'):
+ return 0
+
+ return 1
+
+
+
+#
+# The preprocessor will automatically expand the interactions for water and other
+# geometries inside the kernel, but to get this right we need to setup a couple
+# of defines - we do them in a separate routine to keep the main loop clean.
+#
+# While this routine might look a bit complex it is actually quite straightforward,
+# and the best news is that you wont have to modify _anything_ for a new geometry
+# as long as you correctly define its Electrostatics/Vdw geometry in the lists above!
+#
+def SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines):
+ # What is the _name_ for the i/j group geometry?
+ igeometry = KernelGeom[0]
+ jgeometry = KernelGeom[1]
+ # define so we can access it in the source when the preprocessor runs
+ defines['GEOMETRY_I'] = igeometry
+ defines['GEOMETRY_J'] = jgeometry
+
+ # For the i/j groups, extract a python list of which sites have electrostatics
+ # For SPC/TIP3p this will be [1,1,1], while TIP4p (no elec on first site) will be [0,1,1,1]
+ ielec = GeometryElectrostatics[igeometry]
+ jelec = GeometryElectrostatics[jgeometry]
+ # Zero out the corresponding lists in case we dont do Elec
+ if(KernelElec=='None'):
+ ielec = []
+ jelec = []
+
+ # Extract similar interaction lists for Vdw interactions (example for SPC: [1,0,0])
+ iVdw = GeometryVdw[igeometry]
+ jVdw = GeometryVdw[jgeometry]
+
+ # Zero out the corresponding lists in case we dont do Vdw
+ if(KernelVdw=='None'):
+ iVdw = []
+ jVdw = []
+
+ # iany[] and jany[] contains lists of the particles actually used (for interactions) in this kernel
+ iany = list(set(ielec+iVdw)) # convert to+from set to make elements unique
+ jany = list(set(jelec+jVdw))
+
+ defines['PARTICLES_ELEC_I'] = ielec
+ defines['PARTICLES_ELEC_J'] = jelec
+ defines['PARTICLES_VDW_I'] = iVdw
+ defines['PARTICLES_VDW_J'] = jVdw
+ defines['PARTICLES_I'] = iany
+ defines['PARTICLES_J'] = jany
+
+ # elecij,Vdwij are sets with pairs of particles for which the corresponding interaction is done
+ # (and anyij again corresponds to either electrostatics or Vdw)
+ elecij = []
+ Vdwij = []
+ anyij = []
+
+ for i in ielec:
+ for j in jelec:
+ elecij.append([i,j])
+
+ for i in iVdw:
+ for j in jVdw:
+ Vdwij.append([i,j])
+
+ for i in iany:
+ for j in jany:
+ if [i,j] in elecij or [i,j] in Vdwij:
+ anyij.append([i,j])
+
+ defines['PAIRS_IJ'] = anyij
+
+ # Make an 2d list-of-distance-properties-to-calculate for i,j
+ ni = max(iany)+1
+ nj = max(jany)+1
+ # Each element properties[i][j] is an empty list
+ properties = [ [ [] for j in range(0,nj) ] for i in range (0,ni) ]
+ # Add properties to each set
+ for i in range(0,ni):
+ for j in range(0,nj):
+ if [i,j] in elecij:
+ properties[i][j] = properties[i][j] + ['electrostatics'] + ElectrostaticsList[KernelElec] + ModifierList[KernelElecMod]
+ if [i,j] in Vdwij:
+ properties[i][j] = properties[i][j] + ['vdw'] + VdwList[KernelVdw] + ModifierList[KernelVdwMod]
+ # Add rinv if we need r
+ if 'r' in properties[i][j]:
+ properties[i][j] = properties[i][j] + ['rinv']
+ # Add rsq if we need rinv or rinsq
+ if 'rinv' in properties[i][j] or 'rinvsq' in properties[i][j]:
+ properties[i][j] = properties[i][j] + ['rsq']
+
+ defines['INTERACTION_FLAGS'] = properties
+
+
+
+def PrintStatistics(ratio):
+ ratio = 100.0*ratio
+ print '\rGenerating %s nonbonded kernels... %5.1f%%' % (Arch,ratio),
+ sys.stdout.flush()
+
+
+
+defines = {}
+kerneldecl = []
+
+cnt = 0.0
+nelec = len(ElectrostaticsList)
+nVdw = len(VdwList)
+nmod = len(ModifierList)
+ngeom = len(GeometryNameList)
+
+ntot = nelec*nmod*nVdw*nmod*ngeom
+
+numKernels = 0
+
+fpdecl = open('nb_kernel_' + Arch + '.c','w')
+fpdecl.write( FileHeader )
+fpdecl.write( '#ifndef nb_kernel_' + Arch + '_h\n' )
+fpdecl.write( '#define nb_kernel_' + Arch + '_h\n\n' )
+fpdecl.write( '#include "../nb_kernel.h"\n\n' )
+
+for KernelElec in ElectrostaticsList:
+ defines['KERNEL_ELEC'] = KernelElec
+
+ for KernelElecMod in ModifierList:
+ defines['KERNEL_MOD_ELEC'] = KernelElecMod
+
+ for KernelVdw in VdwList:
+ defines['KERNEL_VDW'] = KernelVdw
+
+ for KernelVdwMod in ModifierList:
+ defines['KERNEL_MOD_VDW'] = KernelVdwMod
+
+ for KernelGeom in GeometryNameList:
+
+ cnt += 1
+ KernelFilename = MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom) + '.c'
+ fpkernel = open(KernelFilename,'w')
+ defines['INCLUDE_HEADER'] = 1 # Include header first time in new file
+ DoHeader = 1
+
+ for KernelVF in VFList:
+
+ KernelName = MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF)
+
+ defines['KERNEL_NAME'] = KernelName
+ defines['KERNEL_VF'] = KernelVF
+
+ # Check if this is a valid/sane/usable combination
+ if not KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+ continue;
+
+ # The overall kernel settings determine what the _kernel_ calculates, but for the water
+ # kernels this does not mean that every pairwise interaction has e.g. Vdw interactions.
+ # This routine sets defines of what to calculate for each pair of particles in those cases.
+ SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines)
+
+ if(DoHeader==1):
+ fpkernel.write( FileHeader )
+
+ gmxpreprocess('nb_kernel_template_' + Arch + '.pre', KernelName+'.tmp' , defines, force=1,contentType='C')
+ numKernels = numKernels + 1
+
+ defines['INCLUDE_HEADER'] = 0 # Header has been included once now
+ DoHeader=0
+
+ # Append temp file contents to the common kernelfile
+ fptmp = open(KernelName+'.tmp','r')
+ fpkernel.writelines(fptmp.readlines())
+ fptmp.close()
+ os.remove(KernelName+'.tmp')
+
+ # Add a declaration for this kernel
+ fpdecl.write('nb_kernel_t ' + KernelName + ';\n');
+
+ # Add declaration to the buffer
+ KernelOther=''
+ kerneldecl.append(MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF))
+
+ filesize = fpkernel.tell()
+ fpkernel.close()
+ if(filesize==0):
+ os.remove(KernelFilename)
+
+ PrintStatistics(cnt/ntot)
+ pass
+ pass
+ pass
+ pass
+pass
+
+# Write out the list of settings and corresponding kernels to the declaration file
+fpdecl.write( '\n\n' )
+fpdecl.write( 'nb_kernel_info_t\n' )
+fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( '{\n' )
+for decl in kerneldecl[0:-1]:
+ fpdecl.write( decl + ',\n' )
+fpdecl.write( kerneldecl[-1] + '\n' )
+fpdecl.write( '};\n\n' )
+fpdecl.write( 'int\n' )
+fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( '#endif\n')
+fpdecl.close()
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 73 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 74 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*74);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 61 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 62 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 159 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 162 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*162);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 139 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 142 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*142);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq01,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq02,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 417 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq01,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq02,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 426 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*426);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 373 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 382 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*382);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq30,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 185 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq30,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 189 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*189);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 165 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 169 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*169);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq13,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq23,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq31,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq32,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq33,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 446 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq13,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq23,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq31,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq32,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq33,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 456 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*456);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 402 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 412 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*412);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 56 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 57 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*57);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 47 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 48 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*48);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 142 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 145 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 125 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 128 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*128);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq01,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq02,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 400 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq01,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq02,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 409 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*409);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 359 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 368 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*368);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq30,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 161 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq30,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 164 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*164);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 144 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 147 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq13,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq23,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq31,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq32,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq33,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 422 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq13,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq23,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq31,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq32,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq33,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 431 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*431);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 381 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 390 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*390);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 43 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 44 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 39 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 40 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 129 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 132 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 117 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 120 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq01,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq02,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 387 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq01,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq02,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 396 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 351 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r01,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r02,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 360 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq30,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 129 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq10,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq20,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq30,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 132 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 117 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r10,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r20,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r30,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 120 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq13,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq23,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq31,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq32,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq33,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 387 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq11,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq12,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq13,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq21,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq22,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq23,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq31,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq32,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq33,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 396 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 351 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r11,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r12,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r13,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r21,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r22,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r23,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r31,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r32,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r33,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,2);
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 360 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 63 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 64 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*64);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 54 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 55 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 119 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 120 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 109 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 287 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 288 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*288);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 270 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 271 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 140 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 141 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 129 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 130 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 311 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 312 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*312);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 294 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 295 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 40 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 40 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*40);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 34 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 34 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 96 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 96 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 88 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 88 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 264 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 264 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*264);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 250 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 250 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 116 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 116 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*116);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 287 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 287 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*287);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 273 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 273 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 28 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 28 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*28);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 27 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 27 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 84 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 84 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*84);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 252 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 252 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*252);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(velec,rinvsq00);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,rinv01);
+ felec = _mm_mul_ps(velec,rinvsq01);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,rinv02);
+ felec = _mm_mul_ps(velec,rinvsq02);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 84 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 84 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*84);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,rinv10);
+ felec = _mm_mul_ps(velec,rinvsq10);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,rinv20);
+ felec = _mm_mul_ps(velec,rinvsq20);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,rinv30);
+ felec = _mm_mul_ps(velec,rinvsq30);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 252 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 252 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*252);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,rinv11);
+ felec = _mm_mul_ps(velec,rinvsq11);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,rinv12);
+ felec = _mm_mul_ps(velec,rinvsq12);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,rinv13);
+ felec = _mm_mul_ps(velec,rinvsq13);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,rinv21);
+ felec = _mm_mul_ps(velec,rinvsq21);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,rinv22);
+ felec = _mm_mul_ps(velec,rinvsq22);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,rinv23);
+ felec = _mm_mul_ps(velec,rinvsq23);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,rinv31);
+ felec = _mm_mul_ps(velec,rinvsq31);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,rinv32);
+ felec = _mm_mul_ps(velec,rinvsq32);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,rinv33);
+ felec = _mm_mul_ps(velec,rinvsq33);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 64 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 65 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*65);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 46 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 47 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*47);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 156 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 159 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*159);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 124 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 127 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*127);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 432 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 441 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*441);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 358 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 367 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*367);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 179 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 182 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*182);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 147 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 150 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*150);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 458 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 467 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*467);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 384 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 393 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*393);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 46 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 47 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*47);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 39 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 40 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 138 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 141 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 117 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 120 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 414 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 423 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*423);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 351 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 360 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 138 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 141 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 117 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 120 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 414 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 423 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*423);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 351 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 360 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 83 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 84 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*84);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 77 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 78 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*78);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 213 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 216 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*216);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 201 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 204 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*204);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 603 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 612 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*612);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 573 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 582 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*582);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 254 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 258 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*258);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 242 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 246 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*246);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 647 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 657 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*657);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 617 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 627 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*627);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 65 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 66 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*66);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 62 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 63 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*63);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 195 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 198 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*198);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 186 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 189 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*189);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 585 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 594 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*594);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 558 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ d = _mm_sub_ps(r01,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ d = _mm_sub_ps(r02,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 567 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*567);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 195 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 198 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*198);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 186 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ d = _mm_sub_ps(r10,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ d = _mm_sub_ps(r20,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ d = _mm_sub_ps(r30,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 189 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*189);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 585 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ velec = _mm_mul_ps(velec,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 594 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*594);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 558 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ d = _mm_sub_ps(r11,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ d = _mm_sub_ps(r12,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ d = _mm_sub_ps(r13,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ d = _mm_sub_ps(r21,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ d = _mm_sub_ps(r22,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ d = _mm_sub_ps(r23,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ d = _mm_sub_ps(r31,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ d = _mm_sub_ps(r32,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ d = _mm_sub_ps(r33,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 567 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*567);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 75 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 76 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*76);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 62 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 63 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*63);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 157 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 160 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*160);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 134 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 137 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*137);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 403 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 412 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*412);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 350 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 359 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*359);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 179 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 183 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*183);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 156 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 160 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*160);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 428 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 438 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*438);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 375 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 385 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*385);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 53 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 54 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 43 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 44 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*44);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 135 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 138 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*138);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 115 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 118 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 381 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 390 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*390);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 331 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 340 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*340);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 155 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 158 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*158);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 135 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 138 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*138);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 404 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 413 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*413);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 354 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 363 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*363);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 41 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 42 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*42);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 36 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 37 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*37);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 123 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 126 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*126);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 111 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*111);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 369 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 378 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*378);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r00,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r01 = _mm_mul_ps(rsq01,rinv01);
+ r01 = _mm_andnot_ps(dummy_mask,r01);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r01,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r02 = _mm_mul_ps(rsq02,rinv02);
+ r02 = _mm_andnot_ps(dummy_mask,r02);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r02,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 333 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*333);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 123 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 126 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*126);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r10 = _mm_mul_ps(rsq10,rinv10);
+ r10 = _mm_andnot_ps(dummy_mask,r10);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r10,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r20 = _mm_mul_ps(rsq20,rinv20);
+ r20 = _mm_andnot_ps(dummy_mask,r20);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r20,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r30 = _mm_mul_ps(rsq30,rinv30);
+ r30 = _mm_andnot_ps(dummy_mask,r30);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r30,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 111 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*111);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 369 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 378 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*378);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r11 = _mm_mul_ps(rsq11,rinv11);
+ r11 = _mm_andnot_ps(dummy_mask,r11);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r11,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r12 = _mm_mul_ps(rsq12,rinv12);
+ r12 = _mm_andnot_ps(dummy_mask,r12);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r12,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r13 = _mm_mul_ps(rsq13,rinv13);
+ r13 = _mm_andnot_ps(dummy_mask,r13);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r13,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r21 = _mm_mul_ps(rsq21,rinv21);
+ r21 = _mm_andnot_ps(dummy_mask,r21);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r21,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r22 = _mm_mul_ps(rsq22,rinv22);
+ r22 = _mm_andnot_ps(dummy_mask,r22);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r22,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r23 = _mm_mul_ps(rsq23,rinv23);
+ r23 = _mm_andnot_ps(dummy_mask,r23);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r23,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r31 = _mm_mul_ps(rsq31,rinv31);
+ r31 = _mm_andnot_ps(dummy_mask,r31);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r31,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r32 = _mm_mul_ps(rsq32,rinv32);
+ r32 = _mm_andnot_ps(dummy_mask,r32);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r32,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r33 = _mm_mul_ps(rsq33,rinv33);
+ r33 = _mm_andnot_ps(dummy_mask,r33);
+
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r33,ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 333 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*333);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ isai0 = _mm_load1_ps(invsqrta+inr+0);
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vgbsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+ dvdasum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 92 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vgb = _mm_andnot_ps(dummy_mask,vgb);
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 93 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 13 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*93);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ isai0 = _mm_load1_ps(invsqrta+inr+0);
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ dvdasum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 82 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 83 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*83);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ isai0 = _mm_load1_ps(invsqrta+inr+0);
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vgbsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+ dvdasum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 71 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vgb = _mm_andnot_ps(dummy_mask,vgb);
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 72 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 13 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*72);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ isai0 = _mm_load1_ps(invsqrta+inr+0);
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ dvdasum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 64 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 65 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*65);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ isai0 = _mm_load1_ps(invsqrta+inr+0);
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vgbsum = _mm_setzero_ps();
+ dvdasum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 58 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vgb = _mm_andnot_ps(dummy_mask,vgb);
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 59 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*12 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ isai0 = _mm_load1_ps(invsqrta+inr+0);
+
+ dvdasum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 56 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+ invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai0,isaj0);
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r00,gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ velec = _mm_mul_ps(qq00,rinv00);
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 57 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*57);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 56 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 57 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*57);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 48 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 49 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 9 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*49);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ rcutoff_scalar = fr->rvdw;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 41 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 41 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*41);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ rcutoff_scalar = fr->rvdw;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 30 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 30 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 9 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*30);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ rcutoff_scalar = fr->rvdw;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 59 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 60 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*60);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ rcutoff_scalar = fr->rvdw;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 56 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 57 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 9 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*57);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 32 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 32 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*32);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 27 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+
+ /* Load parameters for j particles */
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 27 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 9 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*27);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 72 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 73 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*73);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 57 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 58 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*58);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 144 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 145 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 117 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 118 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 360 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 361 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*361);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 297 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 298 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*298);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 164 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 165 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*165);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 138 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 139 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*139);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 383 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 384 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*384);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 321 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 322 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*322);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 54 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 54 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 37 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 37 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*37);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 126 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 126 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*126);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 97 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 97 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*97);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 342 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 342 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*342);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 277 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 277 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*277);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 149 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 149 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*149);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 120 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 120 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*120);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 368 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 368 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*368);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 303 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 303 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*303);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 70 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 71 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*71);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 61 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 62 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 142 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 143 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*143);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 121 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 122 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*122);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 358 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 359 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*359);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 301 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 302 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*302);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 167 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 168 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*168);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 146 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 147 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 386 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ vvdw = _mm_mul_ps(vvdw,sw);
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 387 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*387);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 329 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ d = _mm_sub_ps(r00,rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+ /* Evaluate switch function */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = fvdw;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 330 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*330);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 36 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 36 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*36);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 30 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 30 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*30);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 108 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 90 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 90 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*90);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 270 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq01,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq02,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 270 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*270);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 108 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 90 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq10,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq20,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq30,rcutoff2))
+ {
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ }
+
+ /* Inner loop uses 90 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*90);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 270 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq11,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq12,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq13,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq21,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq22,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq23,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq31,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq32,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ }
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ if (gmx_mm_any_lt(rsq33,rcutoff2))
+ {
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
+
+ fscal = felec;
+
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ }
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 270 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*270);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 67 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 68 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*68);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 54 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 55 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 131 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 132 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 109 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 323 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 324 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 270 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 271 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 152 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 153 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*153);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 129 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 130 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 347 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_00,VV);
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 348 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*348);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: CubicSplineTable
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 294 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ r00 = _mm_mul_ps(rsq00,rinv00);
+ r00 = _mm_andnot_ps(dummy_mask,r00);
+
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r00,vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ vfitab = _mm_slli_epi32(vfitab,3);
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_00,FF);
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_00,FF);
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 295 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 44 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 44 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 12 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 34 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 34 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 88 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 88 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 300 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 300 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 29 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*300);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ qq00 = _mm_mul_ps(iq0,jq0);
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 250 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = _mm_add_ps(felec,fvdw);
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 250 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 128 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 128 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*128);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+ vdwjidx0A = 2*vdwtype[jnrA+0];
+ vdwjidx0B = 2*vdwtype[jnrB+0];
+ vdwjidx0C = 2*vdwtype[jnrC+0];
+ vdwjidx0D = 2*vdwtype[jnrD+0];
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+ vdwparam+vdwioffset0+vdwjidx0B,
+ vdwparam+vdwioffset0+vdwjidx0C,
+ vdwparam+vdwioffset0+vdwjidx0D,
+ &c6_00,&c12_00);
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 108 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+ vvdwsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 323 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ vvdw6 = _mm_mul_ps(c6_00,rinvsix);
+ vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 323 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 38 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*323);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: LennardJones
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+ vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ vdwjidx0A = 2*vdwtype[inr+0];
+ c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+ c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 273 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq00 = gmx_mm_inv_ps(rsq00);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+ fscal = fvdw;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 273 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 36 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 32 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 32 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 11 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*32);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Particle-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+
+ /* Load parameters for i particles */
+ iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 27 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 27 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 10 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 96 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 96 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq00 = _mm_mul_ps(iq0,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 288 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 288 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*288);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water3-Water3
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset0;
+ __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+ __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+ __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+ jq0 = _mm_set1_ps(charge[inr+0]);
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ qq00 = _mm_mul_ps(iq0,jq0);
+ qq01 = _mm_mul_ps(iq0,jq1);
+ qq02 = _mm_mul_ps(iq0,jq2);
+ qq10 = _mm_mul_ps(iq1,jq0);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq20 = _mm_mul_ps(iq2,jq0);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+ iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+ iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+ fix0 = _mm_setzero_ps();
+ fiy0 = _mm_setzero_ps();
+ fiz0 = _mm_setzero_ps();
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+ /* Calculate displacement vector */
+ dx00 = _mm_sub_ps(ix0,jx0);
+ dy00 = _mm_sub_ps(iy0,jy0);
+ dz00 = _mm_sub_ps(iz0,jz0);
+ dx01 = _mm_sub_ps(ix0,jx1);
+ dy01 = _mm_sub_ps(iy0,jy1);
+ dz01 = _mm_sub_ps(iz0,jz1);
+ dx02 = _mm_sub_ps(ix0,jx2);
+ dy02 = _mm_sub_ps(iy0,jy2);
+ dz02 = _mm_sub_ps(iz0,jz2);
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+
+ /* Calculate squared distance and things based on it */
+ rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+ rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+ rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+ rinv00 = gmx_mm_invsqrt_ps(rsq00);
+ rinv01 = gmx_mm_invsqrt_ps(rsq01);
+ rinv02 = gmx_mm_invsqrt_ps(rsq02);
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+
+ rinvsq00 = _mm_mul_ps(rinv00,rinv00);
+ rinvsq01 = _mm_mul_ps(rinv01,rinv01);
+ rinvsq02 = _mm_mul_ps(rinv02,rinv02);
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+
+ fjx0 = _mm_setzero_ps();
+ fjy0 = _mm_setzero_ps();
+ fjz0 = _mm_setzero_ps();
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx00);
+ ty = _mm_mul_ps(fscal,dy00);
+ tz = _mm_mul_ps(fscal,dz00);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx01);
+ ty = _mm_mul_ps(fscal,dy01);
+ tz = _mm_mul_ps(fscal,dz01);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx02);
+ ty = _mm_mul_ps(fscal,dy02);
+ tz = _mm_mul_ps(fscal,dz02);
+
+ /* Update vectorial force */
+ fix0 = _mm_add_ps(fix0,tx);
+ fiy0 = _mm_add_ps(fiy0,ty);
+ fiz0 = _mm_add_ps(fiz0,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx0 = _mm_add_ps(fjx0,tx);
+ fjy0 = _mm_add_ps(fjy0,ty);
+ fjz0 = _mm_add_ps(fjz0,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 96 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 96 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Particle
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+ __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+ __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+ __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+ __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+
+ /* Calculate displacement vector */
+ dx10 = _mm_sub_ps(ix1,jx0);
+ dy10 = _mm_sub_ps(iy1,jy0);
+ dz10 = _mm_sub_ps(iz1,jz0);
+ dx20 = _mm_sub_ps(ix2,jx0);
+ dy20 = _mm_sub_ps(iy2,jy0);
+ dz20 = _mm_sub_ps(iz2,jz0);
+ dx30 = _mm_sub_ps(ix3,jx0);
+ dy30 = _mm_sub_ps(iy3,jy0);
+ dz30 = _mm_sub_ps(iz3,jz0);
+
+ /* Calculate squared distance and things based on it */
+ rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+ rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+ rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+ rinv10 = gmx_mm_invsqrt_ps(rsq10);
+ rinv20 = gmx_mm_invsqrt_ps(rsq20);
+ rinv30 = gmx_mm_invsqrt_ps(rsq30);
+
+ rinvsq10 = _mm_mul_ps(rinv10,rinv10);
+ rinvsq20 = _mm_mul_ps(rinv20,rinv20);
+ rinvsq30 = _mm_mul_ps(rinv30,rinv30);
+
+ /* Load parameters for j particles */
+ jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+ charge+jnrC+0,charge+jnrD+0);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq10 = _mm_mul_ps(iq1,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx10);
+ ty = _mm_mul_ps(fscal,dy10);
+ tz = _mm_mul_ps(fscal,dz10);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq20 = _mm_mul_ps(iq2,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx20);
+ ty = _mm_mul_ps(fscal,dy20);
+ tz = _mm_mul_ps(fscal,dz20);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* Compute parameters for interactions between i and j atoms */
+ qq30 = _mm_mul_ps(iq3,jq0);
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx30);
+ ty = _mm_mul_ps(fscal,dy30);
+ tz = _mm_mul_ps(fscal,dz30);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+
+ /* Inner loop uses 81 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Reset potential sums */
+ velecsum = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 288 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ velecsum = _mm_add_ps(velecsum,velec);
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 288 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ ggid = gid[iidx];
+ /* Update potential energies */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 28 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*288);
+}
+/*
+ * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction: None
+ * Geometry: Water4-Water4
+ * Calculate force/pot: Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ int vdwioffset1;
+ __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+ int vdwioffset2;
+ __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+ int vdwioffset3;
+ __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+ int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+ __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+ int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+ __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+ int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+ __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+ __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+ __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+ __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+ __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+ __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+ __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+ __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+ __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+ __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+ iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+ iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+ jq1 = _mm_set1_ps(charge[inr+1]);
+ jq2 = _mm_set1_ps(charge[inr+2]);
+ jq3 = _mm_set1_ps(charge[inr+3]);
+ qq11 = _mm_mul_ps(iq1,jq1);
+ qq12 = _mm_mul_ps(iq1,jq2);
+ qq13 = _mm_mul_ps(iq1,jq3);
+ qq21 = _mm_mul_ps(iq2,jq1);
+ qq22 = _mm_mul_ps(iq2,jq2);
+ qq23 = _mm_mul_ps(iq2,jq3);
+ qq31 = _mm_mul_ps(iq3,jq1);
+ qq32 = _mm_mul_ps(iq3,jq2);
+ qq33 = _mm_mul_ps(iq3,jq3);
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+ iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+ iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+ ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+ iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+ iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+ iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+ iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+ fix1 = _mm_setzero_ps();
+ fiy1 = _mm_setzero_ps();
+ fiz1 = _mm_setzero_ps();
+ fix2 = _mm_setzero_ps();
+ fiy2 = _mm_setzero_ps();
+ fiz2 = _mm_setzero_ps();
+ fix3 = _mm_setzero_ps();
+ fiy3 = _mm_setzero_ps();
+ fiz3 = _mm_setzero_ps();
+
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ fscal = felec;
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ if(jidx<j_index_end)
+ {
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+ /* Calculate displacement vector */
+ dx11 = _mm_sub_ps(ix1,jx1);
+ dy11 = _mm_sub_ps(iy1,jy1);
+ dz11 = _mm_sub_ps(iz1,jz1);
+ dx12 = _mm_sub_ps(ix1,jx2);
+ dy12 = _mm_sub_ps(iy1,jy2);
+ dz12 = _mm_sub_ps(iz1,jz2);
+ dx13 = _mm_sub_ps(ix1,jx3);
+ dy13 = _mm_sub_ps(iy1,jy3);
+ dz13 = _mm_sub_ps(iz1,jz3);
+ dx21 = _mm_sub_ps(ix2,jx1);
+ dy21 = _mm_sub_ps(iy2,jy1);
+ dz21 = _mm_sub_ps(iz2,jz1);
+ dx22 = _mm_sub_ps(ix2,jx2);
+ dy22 = _mm_sub_ps(iy2,jy2);
+ dz22 = _mm_sub_ps(iz2,jz2);
+ dx23 = _mm_sub_ps(ix2,jx3);
+ dy23 = _mm_sub_ps(iy2,jy3);
+ dz23 = _mm_sub_ps(iz2,jz3);
+ dx31 = _mm_sub_ps(ix3,jx1);
+ dy31 = _mm_sub_ps(iy3,jy1);
+ dz31 = _mm_sub_ps(iz3,jz1);
+ dx32 = _mm_sub_ps(ix3,jx2);
+ dy32 = _mm_sub_ps(iy3,jy2);
+ dz32 = _mm_sub_ps(iz3,jz2);
+ dx33 = _mm_sub_ps(ix3,jx3);
+ dy33 = _mm_sub_ps(iy3,jy3);
+ dz33 = _mm_sub_ps(iz3,jz3);
+
+ /* Calculate squared distance and things based on it */
+ rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+ rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+ rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+ rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+ rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+ rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+ rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+ rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+ rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+ rinv11 = gmx_mm_invsqrt_ps(rsq11);
+ rinv12 = gmx_mm_invsqrt_ps(rsq12);
+ rinv13 = gmx_mm_invsqrt_ps(rsq13);
+ rinv21 = gmx_mm_invsqrt_ps(rsq21);
+ rinv22 = gmx_mm_invsqrt_ps(rsq22);
+ rinv23 = gmx_mm_invsqrt_ps(rsq23);
+ rinv31 = gmx_mm_invsqrt_ps(rsq31);
+ rinv32 = gmx_mm_invsqrt_ps(rsq32);
+ rinv33 = gmx_mm_invsqrt_ps(rsq33);
+
+ rinvsq11 = _mm_mul_ps(rinv11,rinv11);
+ rinvsq12 = _mm_mul_ps(rinv12,rinv12);
+ rinvsq13 = _mm_mul_ps(rinv13,rinv13);
+ rinvsq21 = _mm_mul_ps(rinv21,rinv21);
+ rinvsq22 = _mm_mul_ps(rinv22,rinv22);
+ rinvsq23 = _mm_mul_ps(rinv23,rinv23);
+ rinvsq31 = _mm_mul_ps(rinv31,rinv31);
+ rinvsq32 = _mm_mul_ps(rinv32,rinv32);
+ rinvsq33 = _mm_mul_ps(rinv33,rinv33);
+
+ fjx1 = _mm_setzero_ps();
+ fjy1 = _mm_setzero_ps();
+ fjz1 = _mm_setzero_ps();
+ fjx2 = _mm_setzero_ps();
+ fjy2 = _mm_setzero_ps();
+ fjz2 = _mm_setzero_ps();
+ fjx3 = _mm_setzero_ps();
+ fjy3 = _mm_setzero_ps();
+ fjz3 = _mm_setzero_ps();
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx11);
+ ty = _mm_mul_ps(fscal,dy11);
+ tz = _mm_mul_ps(fscal,dz11);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx12);
+ ty = _mm_mul_ps(fscal,dy12);
+ tz = _mm_mul_ps(fscal,dz12);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx13);
+ ty = _mm_mul_ps(fscal,dy13);
+ tz = _mm_mul_ps(fscal,dz13);
+
+ /* Update vectorial force */
+ fix1 = _mm_add_ps(fix1,tx);
+ fiy1 = _mm_add_ps(fiy1,ty);
+ fiz1 = _mm_add_ps(fiz1,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx21);
+ ty = _mm_mul_ps(fscal,dy21);
+ tz = _mm_mul_ps(fscal,dz21);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx22);
+ ty = _mm_mul_ps(fscal,dy22);
+ tz = _mm_mul_ps(fscal,dz22);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx23);
+ ty = _mm_mul_ps(fscal,dy23);
+ tz = _mm_mul_ps(fscal,dz23);
+
+ /* Update vectorial force */
+ fix2 = _mm_add_ps(fix2,tx);
+ fiy2 = _mm_add_ps(fiy2,ty);
+ fiz2 = _mm_add_ps(fiz2,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx31);
+ ty = _mm_mul_ps(fscal,dy31);
+ tz = _mm_mul_ps(fscal,dz31);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx1 = _mm_add_ps(fjx1,tx);
+ fjy1 = _mm_add_ps(fjy1,ty);
+ fjz1 = _mm_add_ps(fjz1,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx32);
+ ty = _mm_mul_ps(fscal,dy32);
+ tz = _mm_mul_ps(fscal,dz32);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx2 = _mm_add_ps(fjx2,tx);
+ fjy2 = _mm_add_ps(fjy2,ty);
+ fjz2 = _mm_add_ps(fjz2,tz);
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+ fscal = felec;
+
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx33);
+ ty = _mm_mul_ps(fscal,dy33);
+ tz = _mm_mul_ps(fscal,dz33);
+
+ /* Update vectorial force */
+ fix3 = _mm_add_ps(fix3,tx);
+ fiy3 = _mm_add_ps(fiy3,ty);
+ fiz3 = _mm_add_ps(fiz3,tz);
+
+ fjx3 = _mm_add_ps(fjx3,tx);
+ fjy3 = _mm_add_ps(fjy3,ty);
+ fjz3 = _mm_add_ps(fjz3,tz);
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+ /* Inner loop uses 243 flops */
+ }
+
+ /* End of innermost loop */
+
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses 27 flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+}
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifndef nb_kernel_sse2_single_h
+#define nb_kernel_sse2_single_h
+
+#include "../nb_kernel.h"
+
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single;
+
+
+nb_kernel_info_t
+kernellist_sse2_single[] =
+{
+ { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+ { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
+};
+
+int
+kernellist_sse2_single_size = sizeof(kernellist_sse2_single)/sizeof(kernellist_sse2_single[0]);
+
+#endif
--- /dev/null
+/*
+ * Note: this file was generated by the Gromacs c kernel generator.
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifndef nb_kernel_sse2_single_h
+#define nb_kernel_sse2_single_h
+
+#include "../nb_kernel.h"
+
+
+/* List of kernels for this architecture with metadata about them */
+extern nb_kernel_info_t
+kernellist_sse2_single[];
+
+/* Length of kernellist_c */
+extern int
+kernellist_sse2_single_size;
+
+#endif
--- /dev/null
+/* #if 0 */
+#error This file must be processed with the Gromacs pre-preprocessor
+/* #endif */
+/* #if INCLUDE_HEADER */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+/* #endif */
+
+/* ## List of variables set by the generating script: */
+/* ## */
+/* ## Setttings that apply to the entire kernel: */
+/* ## KERNEL_ELEC: String, choice for electrostatic interactions */
+/* ## KERNEL_VDW: String, choice for van der Waals interactions */
+/* ## KERNEL_NAME: String, name of this kernel */
+/* ## KERNEL_VF: String telling if we calculate potential, force, or both */
+/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
+/* ## */
+/* ## Setttings that apply to particles in the outer (I) or inner (J) loops: */
+/* ## PARTICLES_I[]/ Arrays with lists of i/j particles to use in kernel. It is */
+/* ## PARTICLES_J[]: just [0] for particle geometry, but can be longer for water */
+/* ## PARTICLES_ELEC_I[]/ Arrays with lists of i/j particle that have electrostatics */
+/* ## PARTICLES_ELEC_J[]: interactions that should be calculated in this kernel. */
+/* ## PARTICLES_VDW_I[]/ Arrays with the list of i/j particle that have VdW */
+/* ## PARTICLES_VDW_J[]: interactions that should be calculated in this kernel. */
+/* ## */
+/* ## Setttings for pairs of interactions (e.g. 2nd i particle against 1st j particle) */
+/* ## PAIRS_IJ[]: Array with (i,j) tuples of pairs for which interactions */
+/* ## should be calculated in this kernel. Zero-charge particles */
+/* ## do not have interactions with particles without vdw, and */
+/* ## Vdw-only interactions are not evaluated in a no-vdw-kernel. */
+/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
+/* ## For each i-j pair, the element [I][J] is a list of strings */
+/* ## defining properties/flags of this interaction. Examples */
+/* ## include 'electrostatics'/'vdw' if that type of interaction */
+/* ## should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values */
+/* ## are needed, and 'exactcutoff' or 'shift','switch' to */
+/* ## decide if the force/potential should be modified. This way */
+/* ## we only calculate values absolutely needed for each case. */
+
+/* ## Calculate the size and offset for (merged/interleaved) table data */
+
+/*
+ * Gromacs nonbonded kernel: {KERNEL_NAME}
+ * Electrostatics interaction: {KERNEL_ELEC}
+ * VdW interaction: {KERNEL_VDW}
+ * Geometry: {GEOMETRY_I}-{GEOMETRY_J}
+ * Calculate force/pot: {KERNEL_VF}
+ */
+void
+{KERNEL_NAME}
+ (t_nblist * gmx_restrict nlist,
+ rvec * gmx_restrict xx,
+ rvec * gmx_restrict ff,
+ t_forcerec * gmx_restrict fr,
+ t_mdatoms * gmx_restrict mdatoms,
+ nb_kernel_data_t * gmx_restrict kernel_data,
+ t_nrnb * gmx_restrict nrnb)
+{
+ /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
+ /* ## so there is no point in going to extremes to exclude variables that are not needed. */
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+ * just 0 for non-waters.
+ * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+ * jnr indices corresponding to data put in the four positions in the SIMD register.
+ */
+ int i_shift_offset,i_coord_offset,outeriter,inneriter;
+ int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+ int jnrA,jnrB,jnrC,jnrD;
+ int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+ int *iinr,*jindex,*jjnr,*shiftidx,*gid;
+ real shX,shY,shZ,rcutoff_scalar;
+ real *shiftvec,*fshift,*x,*f;
+ __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+ /* #for I in PARTICLES_I */
+ int vdwioffset{I};
+ __m128 ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
+ /* #endfor */
+ /* #for J in PARTICLES_J */
+ int vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
+ __m128 jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
+ /* #endfor */
+ /* #for I,J in PAIRS_IJ */
+ __m128 dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
+ /* #endfor */
+ /* #if KERNEL_ELEC != 'None' */
+ __m128 velec,felec,velecsum,facel,crf,krf,krf2;
+ real *charge;
+ /* #endif */
+ /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+ __m128i gbitab;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 minushalf = _mm_set1_ps(-0.5);
+ real *invsqrta,*dvda,*gbtab;
+ /* #endif */
+ /* #if KERNEL_VDW != 'None' */
+ int nvdwtype;
+ __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+ int *vdwtype;
+ real *vdwparam;
+ __m128 one_sixth = _mm_set1_ps(1.0/6.0);
+ __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
+ /* #endif */
+ /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
+ __m128i vfitab;
+ __m128i ifour = _mm_set1_epi32(4);
+ __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+ real *vftab;
+ /* #endif */
+ /* #if 'Ewald' in KERNEL_ELEC */
+ __m128i ewitab;
+ __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+ real *ewtab;
+ /* #endif */
+ /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+ __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+ real rswitch_scalar,d_scalar;
+ /* #endif */
+ __m128 dummy_mask,cutoff_mask;
+ __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+ __m128 one = _mm_set1_ps(1.0);
+ __m128 two = _mm_set1_ps(2.0);
+ x = xx[0];
+ f = ff[0];
+
+ nri = nlist->nri;
+ iinr = nlist->iinr;
+ jindex = nlist->jindex;
+ jjnr = nlist->jjnr;
+ shiftidx = nlist->shift;
+ gid = nlist->gid;
+ shiftvec = fr->shift_vec[0];
+ fshift = fr->fshift[0];
+ /* #if KERNEL_ELEC != 'None' */
+ facel = _mm_set1_ps(fr->epsfac);
+ charge = mdatoms->chargeA;
+ /* #if 'ReactionField' in KERNEL_ELEC */
+ krf = _mm_set1_ps(fr->ic->k_rf);
+ krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
+ crf = _mm_set1_ps(fr->ic->c_rf);
+ /* #endif */
+ /* #endif */
+ /* #if KERNEL_VDW != 'None' */
+ nvdwtype = fr->ntype;
+ vdwparam = fr->nbfp;
+ vdwtype = mdatoms->typeA;
+ /* #endif */
+
+ /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
+ vftab = kernel_data->table_elec_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+ /* #elif 'Table' in KERNEL_ELEC */
+ vftab = kernel_data->table_elec->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
+ /* #elif 'Table' in KERNEL_VDW */
+ vftab = kernel_data->table_vdw->data;
+ vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
+ /* #endif */
+
+ /* #if 'Ewald' in KERNEL_ELEC */
+ sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
+ /* #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
+ ewtab = fr->ic->tabq_coul_F;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+ /* #else */
+ ewtab = fr->ic->tabq_coul_FDV0;
+ ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
+ ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+ /* #endif */
+ /* #endif */
+
+ /* #if KERNEL_ELEC=='GeneralizedBorn' */
+ invsqrta = fr->invsqrta;
+ dvda = fr->dvda;
+ gbtabscale = _mm_set1_ps(fr->gbtab.scale);
+ gbtab = fr->gbtab.data;
+ gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+ /* #endif */
+
+ /* #if 'Water' in GEOMETRY_I */
+ /* Setup water-specific parameters */
+ inr = nlist->iinr[0];
+ /* #for I in PARTICLES_ELEC_I */
+ iq{I} = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+{I}]));
+ /* #endfor */
+ /* #for I in PARTICLES_VDW_I */
+ vdwioffset{I} = 2*nvdwtype*vdwtype[inr+{I}];
+ /* #endfor */
+ /* #endif */
+
+ /* #if 'Water' in GEOMETRY_J */
+ /* #for J in PARTICLES_ELEC_J */
+ jq{J} = _mm_set1_ps(charge[inr+{J}]);
+ /* #endfor */
+ /* #for J in PARTICLES_VDW_J */
+ vdwjidx{J}A = 2*vdwtype[inr+{J}];
+ /* #endfor */
+ /* #for I,J in PAIRS_IJ */
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+ qq{I}{J} = _mm_mul_ps(iq{I},jq{J});
+ /* #endif */
+ /* #if 'vdw' in INTERACTION_FLAGS[I][J] */
+ c6_{I}{J} = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+ c12_{I}{J} = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+ /* #endif */
+ /* #endfor */
+ /* #endif */
+
+ /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
+ /* #if KERNEL_ELEC!='None' */
+ /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+ rcutoff_scalar = fr->rcoulomb;
+ /* #else */
+ rcutoff_scalar = fr->rvdw;
+ /* #endif */
+ rcutoff = _mm_set1_ps(rcutoff_scalar);
+ rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
+ /* #endif */
+
+ /* #if KERNEL_MOD_VDW=='PotentialShift' */
+ sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
+ rvdw = _mm_set1_ps(fr->rvdw);
+ /* #endif */
+
+ /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+ /* #if KERNEL_MOD_ELEC=='PotentialSwitch' */
+ rswitch_scalar = fr->rcoulomb_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* #else */
+ rswitch_scalar = fr->rvdw_switch;
+ rswitch = _mm_set1_ps(rswitch_scalar);
+ /* #endif */
+ /* Setup switch parameters */
+ d_scalar = rcutoff_scalar-rswitch_scalar;
+ d = _mm_set1_ps(d_scalar);
+ swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+ swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ /* #if 'Force' in KERNEL_VF */
+ swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+ swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+ swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+ /* #endif */
+ /* #endif */
+
+ /* Avoid stupid compiler warnings */
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j_coord_offsetA = 0;
+ j_coord_offsetB = 0;
+ j_coord_offsetC = 0;
+ j_coord_offsetD = 0;
+
+ /* ## Keep track of the floating point operations we issue for reporting! */
+ /* #define OUTERFLOPS 0 */
+ outeriter = 0;
+ inneriter = 0;
+
+ /* Start outer loop over neighborlists */
+ for(iidx=0; iidx<nri; iidx++)
+ {
+ /* Load shift vector for this list */
+ i_shift_offset = DIM*shiftidx[iidx];
+ shX = shiftvec[i_shift_offset+XX];
+ shY = shiftvec[i_shift_offset+YY];
+ shZ = shiftvec[i_shift_offset+ZZ];
+
+ /* Load limits for loop over neighbors */
+ j_index_start = jindex[iidx];
+ j_index_end = jindex[iidx+1];
+
+ /* Get outer coordinate index */
+ inr = iinr[iidx];
+ i_coord_offset = DIM*inr;
+
+ /* Load i particle coords and add shift vector */
+ /* ## Loop over i particles, but only include ones that we use - skip e.g. vdw-only sites for elec-only kernel */
+ /* #for I in PARTICLES_I */
+ ix{I} = _mm_set1_ps(shX + x[i_coord_offset+DIM*{I}+XX]);
+ iy{I} = _mm_set1_ps(shY + x[i_coord_offset+DIM*{I}+YY]);
+ iz{I} = _mm_set1_ps(shZ + x[i_coord_offset+DIM*{I}+ZZ]);
+ /* #define OUTERFLOPS OUTERFLOPS+3 */
+ /* #endfor */
+
+ /* #if 'Force' in KERNEL_VF */
+ /* #for I in PARTICLES_I */
+ fix{I} = _mm_setzero_ps();
+ fiy{I} = _mm_setzero_ps();
+ fiz{I} = _mm_setzero_ps();
+ /* #endfor */
+ /* #endif */
+
+ /* ## For water we already preloaded parameters at the start of the kernel */
+ /* #if not 'Water' in GEOMETRY_I */
+ /* Load parameters for i particles */
+ /* #for I in PARTICLES_ELEC_I */
+ iq{I} = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+{I}));
+ /* #define OUTERFLOPS OUTERFLOPS+1 */
+ /* #if KERNEL_ELEC=='GeneralizedBorn' */
+ isai{I} = _mm_load1_ps(invsqrta+inr+{I});
+ /* #endif */
+ /* #endfor */
+ /* #for I in PARTICLES_VDW_I */
+ vdwioffset{I} = 2*nvdwtype*vdwtype[inr+{I}];
+ /* #endfor */
+ /* #endif */
+
+ /* #if 'Potential' in KERNEL_VF */
+ /* Reset potential sums */
+ /* #if KERNEL_ELEC != 'None' */
+ velecsum = _mm_setzero_ps();
+ /* #endif */
+ /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+ vgbsum = _mm_setzero_ps();
+ /* #endif */
+ /* #if KERNEL_VDW != 'None' */
+ vvdwsum = _mm_setzero_ps();
+ /* #endif */
+ /* #endif */
+ /* #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+ dvdasum = _mm_setzero_ps();
+ /* #endif */
+
+ /* #for ROUND in ['Loop','Epilogue'] */
+
+ /* #if ROUND =='Loop' */
+ /* Start inner kernel loop */
+ for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+ {
+ /* ## First round is normal loop (next statement resets indentation) */
+ /* #if 0 */
+ }
+ /* #endif */
+ /* #else */
+ if(jidx<j_index_end)
+ {
+ /* ## Second round is epilogue */
+ /* #endif */
+ /* #define INNERFLOPS 0 */
+
+ /* Get j neighbor index, and coordinate index */
+ jnrA = jjnr[jidx];
+ jnrB = jjnr[jidx+1];
+ jnrC = jjnr[jidx+2];
+ jnrD = jjnr[jidx+3];
+
+ /* #if ROUND =='Epilogue' */
+ /* Sign of each element will be negative for non-real atoms.
+ * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+ * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+ */
+ dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ jnrA = (jnrA>=0) ? jnrA : 0;
+ jnrB = (jnrB>=0) ? jnrB : 0;
+ jnrC = (jnrC>=0) ? jnrC : 0;
+ jnrD = (jnrD>=0) ? jnrD : 0;
+
+ /* #endif */
+ j_coord_offsetA = DIM*jnrA;
+ j_coord_offsetB = DIM*jnrB;
+ j_coord_offsetC = DIM*jnrC;
+ j_coord_offsetD = DIM*jnrD;
+
+ /* load j atom coordinates */
+ /* #if GEOMETRY_J == 'Particle' */
+ gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0);
+ /* #elif GEOMETRY_J == 'Water3' */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+ /* #elif GEOMETRY_J == 'Water4' */
+ /* #if 0 in PARTICLES_J */
+ gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+ x+j_coord_offsetC,x+j_coord_offsetD,
+ &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+ &jy2,&jz2,&jx3,&jy3,&jz3);
+ /* #else */
+ gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+ x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+ &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+ /* #endif */
+ /* #endif */
+
+ /* Calculate displacement vector */
+ /* #for I,J in PAIRS_IJ */
+ dx{I}{J} = _mm_sub_ps(ix{I},jx{J});
+ dy{I}{J} = _mm_sub_ps(iy{I},jy{J});
+ dz{I}{J} = _mm_sub_ps(iz{I},jz{J});
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endfor */
+
+ /* Calculate squared distance and things based on it */
+ /* #for I,J in PAIRS_IJ */
+ rsq{I}{J} = gmx_mm_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+5 */
+ /* #endfor */
+
+ /* #for I,J in PAIRS_IJ */
+ /* #if 'rinv' in INTERACTION_FLAGS[I][J] */
+ rinv{I}{J} = gmx_mm_invsqrt_ps(rsq{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+5 */
+ /* #endif */
+ /* #endfor */
+
+ /* #for I,J in PAIRS_IJ */
+ /* #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
+ /* # if 'rinv' not in INTERACTION_FLAGS[I][J] */
+ rinvsq{I}{J} = gmx_mm_inv_ps(rsq{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #else */
+ rinvsq{I}{J} = _mm_mul_ps(rinv{I}{J},rinv{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #endif */
+ /* #endfor */
+
+ /* #if not 'Water' in GEOMETRY_J */
+ /* Load parameters for j particles */
+ /* #for J in PARTICLES_ELEC_J */
+ jq{J} = gmx_mm_load_4real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
+ charge+jnrC+{J},charge+jnrD+{J});
+ /* #if KERNEL_ELEC=='GeneralizedBorn' */
+ isaj{J} = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
+ invsqrta+jnrC+{J},invsqrta+jnrD+{J});
+ /* #endif */
+ /* #endfor */
+ /* #for J in PARTICLES_VDW_J */
+ vdwjidx{J}A = 2*vdwtype[jnrA+{J}];
+ vdwjidx{J}B = 2*vdwtype[jnrB+{J}];
+ vdwjidx{J}C = 2*vdwtype[jnrC+{J}];
+ vdwjidx{J}D = 2*vdwtype[jnrD+{J}];
+ /* #endfor */
+ /* #endif */
+
+ /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_J */
+ /* #for J in PARTICLES_J */
+ fjx{J} = _mm_setzero_ps();
+ fjy{J} = _mm_setzero_ps();
+ fjz{J} = _mm_setzero_ps();
+ /* #endfor */
+ /* #endif */
+
+ /* #for I,J in PAIRS_IJ */
+
+ /**************************
+ * CALCULATE INTERACTIONS *
+ **************************/
+
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## We always calculate rinv/rinvsq above to enable pipelineing in compilers (performance tested on x86) */
+ if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
+ {
+ /* #if 0 ## this and the next two lines is a hack to maintain auto-indentation in template file */
+ }
+ /* #endif */
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+
+ /* #if 'r' in INTERACTION_FLAGS[I][J] */
+ r{I}{J} = _mm_mul_ps(rsq{I}{J},rinv{I}{J});
+ /* #if ROUND == 'Epilogue' */
+ r{I}{J} = _mm_andnot_ps(dummy_mask,r{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+
+ /* ## For water geometries we already loaded parameters at the start of the kernel */
+ /* #if not 'Water' in GEOMETRY_J */
+ /* Compute parameters for interactions between i and j atoms */
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+ qq{I}{J} = _mm_mul_ps(iq{I},jq{J});
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #if 'vdw' in INTERACTION_FLAGS[I][J] */
+ gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset{I}+vdwjidx{J}A,
+ vdwparam+vdwioffset{I}+vdwjidx{J}B,
+ vdwparam+vdwioffset{I}+vdwjidx{J}C,
+ vdwparam+vdwioffset{I}+vdwjidx{J}D,
+ &c6_{I}{J},&c12_{I}{J});
+ /* #endif */
+ /* #endif */
+
+ /* #if 'table' in INTERACTION_FLAGS[I][J] */
+ /* Calculate table index by multiplying r with table scale and truncate to integer */
+ rt = _mm_mul_ps(r{I}{J},vftabscale);
+ vfitab = _mm_cvttps_epi32(rt);
+ vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
+ /* ## 3 tables, 4 bytes per point: multiply index by 12 */
+ vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+ /* #elif 'Table' in KERNEL_ELEC */
+ /* ## 1 table, 4 bytes per point: multiply index by 4 */
+ vfitab = _mm_slli_epi32(vfitab,2);
+ /* #elif 'Table' in KERNEL_VDW */
+ /* ## 2 tables, 4 bytes per point: multiply index by 8 */
+ vfitab = _mm_slli_epi32(vfitab,3);
+ /* #endif */
+ /* #endif */
+
+ /* ## ELECTROSTATIC INTERACTIONS */
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+
+ /* #if KERNEL_ELEC=='Coulomb' */
+
+ /* COULOMB ELECTROSTATICS */
+ velec = _mm_mul_ps(qq{I}{J},rinv{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #if 'Force' in KERNEL_VF */
+ felec = _mm_mul_ps(velec,rinvsq{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+2 */
+ /* #endif */
+
+ /* #elif KERNEL_ELEC=='ReactionField' */
+
+ /* REACTION-FIELD ELECTROSTATICS */
+ /* #if 'Potential' in KERNEL_VF */
+ velec = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_add_ps(rinv{I}{J},_mm_mul_ps(krf,rsq{I}{J})),crf));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #endif */
+ /* #if 'Force' in KERNEL_VF */
+ felec = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_mul_ps(rinv{I}{J},rinvsq{I}{J}),krf2));
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+
+ /* #elif KERNEL_ELEC=='GeneralizedBorn' */
+
+ /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+ isaprod = _mm_mul_ps(isai{I},isaj{J});
+ gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+{J},dvda+jnrB+{J},dvda+jnrC+{J},dvda+jnrD+{J});
+ /* #define INNERFLOPS INNERFLOPS+5 */
+
+ /* Calculate generalized born table index - this is a separate table from the normal one,
+ * but we use the same procedure by multiplying r with scale and truncating to integer.
+ */
+ rt = _mm_mul_ps(r{I}{J},gbscale);
+ gbitab = _mm_cvttps_epi32(rt);
+ gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+ gbitab = _mm_slli_epi32(gbitab,2);
+
+ Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+ F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+ G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+ H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(gbeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+ VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+ vgb = _mm_mul_ps(gbqqfactor,VV);
+ /* #define INNERFLOPS INNERFLOPS+10 */
+
+ /* #if 'Force' in KERNEL_VF */
+ FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+ dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+ _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
+ /* #define INNERFLOPS INNERFLOPS+13 */
+ /* #endif */
+ velec = _mm_mul_ps(qq{I}{J},rinv{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #if 'Force' in KERNEL_VF */
+ felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv{I}{J}),fgb),rinv{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+
+ /* #elif KERNEL_ELEC=='Ewald' */
+ /* EWALD ELECTROSTATICS */
+
+ /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+ ewrt = _mm_mul_ps(r{I}{J},ewtabscale);
+ ewitab = _mm_cvttps_epi32(ewrt);
+ eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
+ ewitab = _mm_slli_epi32(ewitab,2);
+ ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+ ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+ ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+ ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+ _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+ felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+ /* #define INNERFLOPS INNERFLOPS+2 */
+ /* #if KERNEL_MOD_ELEC=='PotentialShift' */
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_sub_ps(rinv{I}{J},sh_ewald),velec));
+ /* #define INNERFLOPS INNERFLOPS+7 */
+ /* #else */
+ velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+ velec = _mm_mul_ps(qq{I}{J},_mm_sub_ps(rinv{I}{J},velec));
+ /* #define INNERFLOPS INNERFLOPS+6 */
+ /* #endif */
+ /* #if 'Force' in KERNEL_VF */
+ felec = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+ /* #elif KERNEL_VF=='Force' */
+ gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+ ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+ &ewtabF,&ewtabFn);
+ felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+ felec = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
+ /* #define INNERFLOPS INNERFLOPS+7 */
+ /* #endif */
+
+ /* #elif KERNEL_ELEC=='CubicSplineTable' */
+
+ /* CUBIC SPLINE TABLE ELECTROSTATICS */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #if 'Potential' in KERNEL_VF */
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ velec = _mm_mul_ps(qq{I}{J},VV);
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+ /* #if 'Force' in KERNEL_VF */
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq{I}{J},FF),_mm_mul_ps(vftabscale,rinv{I}{J})));
+ /* #define INNERFLOPS INNERFLOPS+7 */
+ /* #endif */
+ /* #endif */
+ /* ## End of check for electrostatics interaction forms */
+ /* #endif */
+ /* ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
+
+ /* #if 'vdw' in INTERACTION_FLAGS[I][J] */
+
+ /* #if KERNEL_VDW=='LennardJones' */
+
+ /* LENNARD-JONES DISPERSION/REPULSION */
+
+ rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+2 */
+ /* #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+ vvdw6 = _mm_mul_ps(c6_{I}{J},rinvsix);
+ vvdw12 = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #if KERNEL_MOD_VDW=='PotentialShift' */
+ vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+ _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
+ /* #define INNERFLOPS INNERFLOPS+8 */
+ /* #else */
+ vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+ /* ## Check for force inside potential check, i.e. this means we already did the potential part */
+ /* #if 'Force' in KERNEL_VF */
+ fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
+ /* #define INNERFLOPS INNERFLOPS+2 */
+ /* #endif */
+ /* #elif KERNEL_VF=='Force' */
+ /* ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
+ fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_ps(rinvsix,rinvsq{I}{J}));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #endif */
+
+ /* #elif KERNEL_VDW=='CubicSplineTable' */
+
+ /* CUBIC SPLINE TABLE DISPERSION */
+ /* #if 'Table' in KERNEL_ELEC */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ /* #endif */
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #if 'Potential' in KERNEL_VF */
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw6 = _mm_mul_ps(c6_{I}{J},VV);
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+ /* #if 'Force' in KERNEL_VF */
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw6 = _mm_mul_ps(c6_{I}{J},FF);
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #endif */
+
+ /* CUBIC SPLINE TABLE REPULSION */
+ vfitab = _mm_add_epi32(vfitab,ifour);
+ Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+ F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+ G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+ H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ Heps = _mm_mul_ps(vfeps,H);
+ Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #if 'Potential' in KERNEL_VF */
+ VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+ vvdw12 = _mm_mul_ps(c12_{I}{J},VV);
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+ /* #if 'Force' in KERNEL_VF */
+ FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+ fvdw12 = _mm_mul_ps(c12_{I}{J},FF);
+ /* #define INNERFLOPS INNERFLOPS+5 */
+ /* #endif */
+ /* #if 'Potential' in KERNEL_VF */
+ vvdw = _mm_add_ps(vvdw12,vvdw6);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #if 'Force' in KERNEL_VF */
+ fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #endif */
+ /* #endif */
+ /* ## End of check for vdw interaction forms */
+ /* #endif */
+ /* ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
+
+ /* #if 'switch' in INTERACTION_FLAGS[I][J] */
+ d = _mm_sub_ps(r{I}{J},rswitch);
+ d = _mm_max_ps(d,_mm_setzero_ps());
+ d2 = _mm_mul_ps(d,d);
+ sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+ /* #define INNERFLOPS INNERFLOPS+10 */
+
+ /* #if 'Force' in KERNEL_VF */
+ dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+ /* #define INNERFLOPS INNERFLOPS+5 */
+ /* #endif */
+
+ /* Evaluate switch function */
+ /* #if 'Force' in KERNEL_VF */
+ /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+ felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(velec,dsw)) );
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #endif */
+ /* #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+ fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(vvdw,dsw)) );
+ /* #define INNERFLOPS INNERFLOPS+4 */
+ /* #endif */
+ /* #endif */
+ /* #if 'Potential' in KERNEL_VF */
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+ velec = _mm_mul_ps(velec,sw);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+ vvdw = _mm_mul_ps(vvdw,sw);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #endif */
+ /* #endif */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ cutoff_mask = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+
+ /* #if 'Potential' in KERNEL_VF */
+ /* Update potential sum for this i atom from the interaction with this j atom. */
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ velec = _mm_and_ps(velec,cutoff_mask);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #if ROUND == 'Epilogue' */
+ velec = _mm_andnot_ps(dummy_mask,velec);
+ /* #endif */
+ velecsum = _mm_add_ps(velecsum,velec);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #if KERNEL_ELEC=='GeneralizedBorn' */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ vgb = _mm_and_ps(vgb,cutoff_mask);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #if ROUND == 'Epilogue' */
+ vgb = _mm_andnot_ps(dummy_mask,vgb);
+ /* #endif */
+ vgbsum = _mm_add_ps(vgbsum,vgb);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #endif */
+ /* #if 'vdw' in INTERACTION_FLAGS[I][J] */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #if ROUND == 'Epilogue' */
+ vvdw = _mm_andnot_ps(dummy_mask,vvdw);
+ /* #endif */
+ vvdwsum = _mm_add_ps(vvdwsum,vvdw);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+ /* #endif */
+
+ /* #if 'Force' in KERNEL_VF */
+
+ /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
+ fscal = _mm_add_ps(felec,fvdw);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
+ fscal = felec;
+ /* #elif 'vdw' in INTERACTION_FLAGS[I][J] */
+ fscal = fvdw;
+ /* #endif */
+
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
+
+ /* #if ROUND == 'Epilogue' */
+ fscal = _mm_andnot_ps(dummy_mask,fscal);
+ /* #endif */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx{I}{J});
+ ty = _mm_mul_ps(fscal,dy{I}{J});
+ tz = _mm_mul_ps(fscal,dz{I}{J});
+
+ /* Update vectorial force */
+ fix{I} = _mm_add_ps(fix{I},tx);
+ fiy{I} = _mm_add_ps(fiy{I},ty);
+ fiz{I} = _mm_add_ps(fiz{I},tz);
+ /* #define INNERFLOPS INNERFLOPS+6 */
+
+ /* #if GEOMETRY_J == 'Particle' */
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ tx,ty,tz);
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #else */
+ fjx{J} = _mm_add_ps(fjx{J},tx);
+ fjy{J} = _mm_add_ps(fjy{J},ty);
+ fjz{J} = _mm_add_ps(fjz{J},tz);
+ /* #define INNERFLOPS INNERFLOPS+3 */
+ /* #endif */
+
+ /* #endif */
+
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* #if 0 ## This and next two lines is a hack to maintain indentation in template file */
+ {
+ /* #endif */
+ }
+ /* #endif */
+ /* ## End of check for the interaction being outside the cutoff */
+
+ /* #endfor */
+ /* ## End of loop over i-j interaction pairs */
+
+ /* #if GEOMETRY_J == 'Water3' */
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+ /* #define INNERFLOPS INNERFLOPS+9 */
+ /* #elif GEOMETRY_J == 'Water4' */
+ /* #if 0 in PARTICLES_J */
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+ f+j_coord_offsetC,f+j_coord_offsetD,
+ fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+ fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+ /* #define INNERFLOPS INNERFLOPS+12 */
+ /* #else */
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+ f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+ /* #define INNERFLOPS INNERFLOPS+9 */
+ /* #endif */
+ /* #endif */
+
+ /* Inner loop uses {INNERFLOPS} flops */
+ }
+
+ /* #endfor */
+
+ /* End of innermost loop */
+
+ /* #if 'Force' in KERNEL_VF */
+ /* #if GEOMETRY_I == 'Particle' */
+ gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+ f+i_coord_offset,fshift+i_shift_offset);
+ /* #define OUTERFLOPS OUTERFLOPS+6 */
+ /* #elif GEOMETRY_I == 'Water3' */
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+ f+i_coord_offset,fshift+i_shift_offset);
+ /* #define OUTERFLOPS OUTERFLOPS+18 */
+ /* #elif GEOMETRY_I == 'Water4' */
+ /* #if 0 in PARTICLES_I */
+ gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset,fshift+i_shift_offset);
+ /* #define OUTERFLOPS OUTERFLOPS+24 */
+ /* #else */
+ gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+ f+i_coord_offset+DIM,fshift+i_shift_offset);
+ /* #define OUTERFLOPS OUTERFLOPS+18 */
+ /* #endif */
+ /* #endif */
+ /* #endif */
+
+ /* #if 'Potential' in KERNEL_VF */
+ ggid = gid[iidx];
+ /* Update potential energies */
+ /* #if KERNEL_ELEC != 'None' */
+ gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+ /* #define OUTERFLOPS OUTERFLOPS+1 */
+ /* #endif */
+ /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+ gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+ /* #define OUTERFLOPS OUTERFLOPS+1 */
+ /* #endif */
+ /* #if KERNEL_VDW != 'None' */
+ gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+ /* #define OUTERFLOPS OUTERFLOPS+1 */
+ /* #endif */
+ /* #endif */
+ /* #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai{I},isai{I}));
+ gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+ /* #endif */
+
+ /* Increment number of inner iterations */
+ inneriter += j_index_end - j_index_start;
+
+ /* Outer loop uses {OUTERFLOPS} flops */
+ }
+
+ /* Increment number of outer iterations */
+ outeriter += nri;
+
+ /* Update outer/inner flops */
+ /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
+ /* ## primitive and replaces aggressively even in strings inside these directives, we need to */
+ /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source. */
+ /* #if GEOMETRY_I == 'Water3' */
+ /* #define ISUFFIX '_W3' */
+ /* #elif GEOMETRY_I == 'Water4' */
+ /* #define ISUFFIX '_W4' */
+ /* #else */
+ /* #define ISUFFIX '' */
+ /* #endif */
+ /* #if GEOMETRY_J == 'Water3' */
+ /* #define JSUFFIX 'W3' */
+ /* #elif GEOMETRY_J == 'Water4' */
+ /* #define JSUFFIX 'W4' */
+ /* #else */
+ /* #define JSUFFIX '' */
+ /* #endif */
+ /* #if 'PotentialAndForce' in KERNEL_VF */
+ /* #define VFSUFFIX '_VF' */
+ /* #elif 'Potential' in KERNEL_VF */
+ /* #define VFSUFFIX '_V' */
+ /* #else */
+ /* #define VFSUFFIX '_F' */
+ /* #endif */
+
+ /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+ /* #elif KERNEL_ELEC != 'None' */
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+ /* #else */
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+ /* #endif */
+}
/* Different default (c) and accelerated interaction-specific kernels */
#include "nb_kernel_c/nb_kernel_c.h"
+/* Temporary enabler until we add the AVX kernels */
+#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) || (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) || (defined GMX_CPU_ACCELERATION_X86_AVX_256)
+# define GMX_CPU_ACCELERATION_X86_SSE2
+#endif
+
+#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+# include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
+#endif
+
+
#ifdef GMX_THREAD_MPI
static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
#endif
/* Add the generic kernels to the structure stored statically in nb_kernel.c */
nb_kernel_list_add_kernels(kernellist_c,kernellist_c_size);
- if(fr->use_cpu_acceleration==TRUE)
+ if(!(fr!=NULL && fr->use_cpu_acceleration==FALSE))
{
/* Add interaction-specific kernels for different architectures */
+#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+ nb_kernel_list_add_kernels(kernellist_sse2_single,kernellist_sse2_single_size);
+#endif
; /* empty statement to avoid a completely empty block */
}
}
const char * other;
const char * vf;
- const char * arch[] =
+ struct
{
- "c"
+ const char * arch;
+ int simd_padding_width;
+ }
+ arch_and_padding[] =
+ {
+#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+ { "sse2_single", 4 },
+#endif
+ { "c", 1 },
};
- int narch = asize(arch);
+ int narch = asize(arch_and_padding);
int i;
if(nonbonded_setup_done==FALSE)
{
nl->kernelptr_vf = gmx_nb_free_energy_kernel;
nl->kernelptr_f = gmx_nb_free_energy_kernel;
+ nl->simd_padding_width = 1;
}
else if(!gmx_strcasecmp_min(geom,"CG-CG"))
{
nl->kernelptr_vf = gmx_nb_generic_cg_kernel;
nl->kernelptr_f = gmx_nb_generic_cg_kernel;
+ nl->simd_padding_width = 1;
}
else
{
for(i=0;i<narch && nl->kernelptr_vf==NULL ;i++)
{
- nl->kernelptr_vf = nb_kernel_list_findkernel(log,arch[i],elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+ nl->kernelptr_vf = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+ nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
}
for(i=0;i<narch && nl->kernelptr_f==NULL ;i++)
{
- nl->kernelptr_f = nb_kernel_list_findkernel(log,arch[i],elec,elec_mod,vdw,vdw_mod,geom,other,"Force");
+ nl->kernelptr_f = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"Force");
+ nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
+
/* If there is not force-only optimized kernel, is there a potential & force one? */
if(nl->kernelptr_f == NULL)
{
- nl->kernelptr_f = nb_kernel_list_findkernel(NULL,arch[i],elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+ nl->kernelptr_f = nb_kernel_list_findkernel(NULL,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+ nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
}
}
-
+
/* Give up, pick a generic one instead */
if(nl->kernelptr_vf==NULL)
{
nl->kernelptr_vf = gmx_nb_generic_kernel;
nl->kernelptr_f = gmx_nb_generic_kernel;
+ nl->simd_padding_width = 1;
if(log)
{
fprintf(log,
double cycles_pmes;
gmx_bool bPMETuneTry=FALSE,bPMETuneRunning=FALSE;
- if(MASTER(cr))
- {
- gmx_warning("New C kernels (and force-only) kernels are now enabled,\n"
- "but it will be another couple of days for SSE/AVX kernels.\n\n");
- }
-
#ifdef GMX_FAHCORE
/* Temporary addition for FAHCORE checkpointing */
int chkpt_ret;
nsearch = search_neighbours(fp,fr,x,box,top,groups,cr,nrnb,md,
lambda,dvdlambda,grppener,
- bFillGrid,bDoLongRangeNS);
+ bFillGrid,bDoLongRangeNS,TRUE);
if (debug)
fprintf(debug,"nsearch = %d\n",nsearch);
#define NOTEXCL(e,i,j) !(ISEXCL(e,i,j))
#endif
+static int
+round_up_to_simd_width(int length, int simd_width)
+{
+ int offset,newlength;
+
+ offset = (simd_width>0) ? length % simd_width : 0;
+
+ return (offset==0) ? length : length-offset+simd_width;
+}
/************************************************
*
* U T I L I T I E S F O R N S
nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
}
+ /* This will also set the simd_padding_width field */
gmx_nonbonded_set_kernel_pointers( (i==0) ? log : NULL,nl);
/* maxnri is influenced by the number of shifts (maximum is 8)
if (nri >= 0)
{
+ /* Add elements up to padding. Since we allocate memory in units
+ * of the simd_padding width, we do not have to check for possible
+ * list reallocation here.
+ */
+ while((nlist->nrj % nlist->simd_padding_width)!=0)
+ {
+ /* Use -4 here, so we can write forces for 4 atoms before real data */
+ nlist->jjnr[nlist->nrj++]=-4;
+ }
nlist->jindex[nri+1] = nlist->nrj;
len=nlist->nrj - nlist->jindex[nri];
if (nlist->nrj >= nlist->maxnrj)
{
- nlist->maxnrj = over_alloc_small(nlist->nrj + 1);
+ nlist->maxnrj = round_up_to_simd_width(over_alloc_small(nlist->nrj + 1),nlist->simd_padding_width);
+
if (gmx_debug_at)
fprintf(debug,"Increasing %s nblist (ielec=%d,ivdw=%d,free=%d,igeometry=%d) j size to %d\n",
bLR ? "LR" : "SR",nlist->ielec,nlist->ivdw,nlist->free_energy,nlist->igeometry,nlist->maxnrj);
real *lambda,real *dvdlambda,
gmx_grppairener_t *grppener,
gmx_bool bFillGrid,
- gmx_bool bDoLongRangeNS)
+ gmx_bool bDoLongRangeNS,
+ gmx_bool bPadListsForKernels)
{
t_block *cgs=&(top->cgs);
rvec box_size,grid_x0,grid_x1;
snew(dvdl,efptNR);
init_neighbor_list(fp,fr,md->homenr);
search_neighbours(fp,fr,x,box,top,
- &mtop->groups,cr,&nrnb,md,lambda,dvdl,NULL,TRUE,FALSE);
+ &mtop->groups,cr,&nrnb,md,lambda,dvdl,NULL,TRUE,FALSE,FALSE);
if (debug)
dump_nblist(debug,cr,fr,0);